kiln_ai.datamodel.eval

  1import json
  2from enum import Enum
  3from threading import Lock
  4from typing import TYPE_CHECKING, Any, Dict, List, Union
  5
  6from pydantic import BaseModel, Field, model_validator
  7from typing_extensions import Self
  8
  9from kiln_ai.datamodel.basemodel import (
 10    ID_TYPE,
 11    FilenameString,
 12    FilenameStringShort,
 13    KilnParentedModel,
 14    KilnParentModel,
 15)
 16from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
 17from kiln_ai.datamodel.dataset_filters import DatasetFilterId
 18from kiln_ai.datamodel.json_schema import string_to_json_key
 19from kiln_ai.datamodel.task_run import Usage
 20from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error
 21
 22if TYPE_CHECKING:
 23    from kiln_ai.datamodel.spec import Spec
 24    from kiln_ai.datamodel.task import Task
 25
 26EvalScores = Dict[str, float]
 27
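A tiny illustrative sketch (not part of the module source): EvalScores is simply a mapping from each output score's json_key to a float value. The keys below are placeholders.

from kiln_ai.datamodel.eval import EvalScores

# Placeholder keys; real keys come from EvalOutputScore.json_key().
example_scores: EvalScores = {"overall_rating": 4.0, "helpfulness": 1.0}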
 28# Module-level set to track evals currently being migrated (to prevent recursion)
 29# Protected by _migration_lock to ensure thread-safe access
 30_migration_lock = Lock()
 31_currently_migrating_eval_ids: set[ID_TYPE] = set()
 32
 33
 34class EvalTemplateId(str, Enum):
 35    """
 36    An eval template is a pre-defined eval that can be used as a starting point for a new eval.
 37    """
 38
 39    kiln_requirements = "kiln_requirements"
 40    desired_behaviour = "desired_behaviour"
 41    issue = "kiln_issue"
 42    tool_call = "tool_call"
 43    toxicity = "toxicity"
 44    bias = "bias"
 45    maliciousness = "maliciousness"
 46    factual_correctness = "factual_correctness"
 47    jailbreak = "jailbreak"
 48    rag = "rag"
 49
 50
 51class EvalConfigType(str, Enum):
 52    g_eval = "g_eval"
 53    llm_as_judge = "llm_as_judge"
 54
 55
 56class EvalOutputScore(BaseModel):
 57    """
 58    A definition of a score that an evaluator will produce.
 59
 60    Very similar to TaskRequirement, but conceptually different, so kept in a separate model.
 61    """
 62
 63    name: FilenameStringShort = Field(
 64        description="The name of the score. Will be provided to the model so use a descriptive name. Should align to the model's TaskRequirement name if you want to use human evals to evaluate the evaluator's performance."
 65    )
 66    instruction: str | None = Field(
 67        default=None,
 68        description="A description of the score, used to help the model understand the goal of the score. Will be provided to evaluator models, so should be written for the model, not the team/user.",
 69    )
 70    type: TaskOutputRatingType = Field(
 71        description="The type of rating to use ('five_star', 'pass_fail', 'pass_fail_critical').",
 72    )
 73
 74    def json_key(self) -> str:
 75        """
 76        The JSON key for the score, used when running the evaluator with a LLM and we need JSON output.
 77
 78        For example, "Overall Rating" -> "overall_rating"
 79        """
 80        return string_to_json_key(self.name)
 81
 82    @model_validator(mode="after")
 83    def validate_type(self) -> Self:
 84        if self.type == TaskOutputRatingType.custom:
 85            raise ValueError(
 86                f"Custom scores are not supported in evaluators. Score '{self.name}' was set to a custom score."
 87            )
 88        return self
 89
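A minimal usage sketch (illustrative, not from the module itself), assuming this module's import path; the score name and instruction are placeholder values.

from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
from kiln_ai.datamodel.eval import EvalOutputScore

# Placeholder name/instruction for a five-star score.
score = EvalOutputScore(
    name="Overall Rating",
    instruction="Rate the overall quality of the answer.",
    type=TaskOutputRatingType.five_star,
)

# json_key() gives the snake_case key used in evaluator JSON output,
# e.g. "Overall Rating" -> "overall_rating".
assert score.json_key() == "overall_rating"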
 90
 91class EvalRun(KilnParentedModel):
 92    """
 93    The results of running an eval on a single dataset item.
 94
 95    This is a child of an EvalConfig, which specifies how the scores were generated.
 96
 97    Eval runs can be one of 2 types:
 98    1) eval_config_eval=False: we were evaluating a task run (a method of running the task). We got the task input from the dataset item (dataset_id), ran the task with the task_run_config, then ran the evaluator on that output. task_run_config_id must be set. The output saved in this model is the output of the task run.
 99    2) eval_config_eval=True: we were evaluating an eval config (a method of evaluating the task). We used the existing dataset item's input/output, and ran the evaluator on it. task_run_config_id must be None. The input/output saved in this model is the input/output of the dataset item.
100    """
101
102    dataset_id: ID_TYPE = Field(
103        description="The ID of the dataset item that was used for this run. Must belong to the same Task as the grand-parent eval of this EvalRun."
104    )
105    task_run_config_id: ID_TYPE | None = Field(
106        description="The ID of the TaskRunConfig that was run, if this eval run was based on a task run. Must belong to the same Task as this eval. Can be None if this eval run is based on an eval config."
107    )
108    eval_config_eval: bool = Field(
109        description="Whether this eval run evaluates the parent eval config (evaluating the config using an existing dataset item). If true, task_run_config_id must be None, as we're not running the task.",
110        default=False,
111    )
112    # These two may duplicate the dataset_id.input/output, but we're denormalizing intentionally.
113    input: str = Field(
114        description="The input to the task. JSON formatted for structured input, plaintext for unstructured input."
115    )
116    output: str = Field(
117        description="The output of the task. JSON formatted for structured output, plaintext for unstructured output."
118    )
119    reference_answer: str | None = Field(
120        default=None,
121        description="The reference answer for the input. JSON formatted for structured reference answer, plaintext for unstructured reference answer. Used for reference answer evals.",
122    )
123    intermediate_outputs: Dict[str, str] | None = Field(
124        default=None,
125        description="The intermediate outputs of the task (for example, eval thinking).",
126    )
127    task_run_trace: str | None = Field(
128        default=None,
129        description="The JSON formatted trace of the task run that produced the output.",
130    )
131    scores: EvalScores = Field(
132        description="The output scores of the evaluator (aligning to those required by the grand-parent Eval this object is a child of)."
133    )
134    task_run_usage: Usage | None = Field(
135        default=None,
136        description="The usage of the task run that produced this eval run output (not the usage by the evaluation model).",
137    )
138
139    def parent_eval_config(self) -> Union["EvalConfig", None]:
140        if self.parent is not None and self.parent.__class__.__name__ != "EvalConfig":
141            raise ValueError("parent must be an EvalConfig")
142        return self.parent  # type: ignore
143
144    @model_validator(mode="after")
145    def validate_output_fields(self) -> Self:
146        parent_eval_config = self.parent_eval_config()
147        parent_eval = parent_eval_config.parent_eval() if parent_eval_config else None
148        if not parent_eval:
149            return self
150
151        evaluation_data_type = parent_eval.evaluation_data_type
152        if (
153            evaluation_data_type == EvalDataType.final_answer
154            and self.task_run_trace is not None
155        ):
156            raise ValueError("final_answer runs should not set trace")
157        elif (
158            not self.eval_config_eval
159            and evaluation_data_type == EvalDataType.full_trace
160            and self.task_run_trace is None
161        ):
162            raise ValueError("full_trace task run eval runs should include trace")
163
164        return self
165
166    @model_validator(mode="after")
167    def validate_eval_run_types(self) -> Self:
168        if self.eval_config_eval and self.task_run_config_id is not None:
169            raise ValueError(
170                "task_run_config_id must be None if eval_config_eval is true"
171            )
172        if not self.eval_config_eval and self.task_run_config_id is None:
173            raise ValueError(
174                "task_run_config_id must be set if eval_config_eval is false"
175            )
176        return self
177
178    @model_validator(mode="after")
179    def validate_scores(self) -> Self:
180        # We're checking the scores have the expected keys from the grand-parent eval
181        if self.scores is None or len(self.scores) == 0:
182            raise ValueError("scores are required, and must have at least one score.")
183
184        parent_eval_config = self.parent_eval_config()
185        eval = parent_eval_config.parent_eval() if parent_eval_config else None
186        if not eval:
187            # Can't validate without the grand-parent eval, allow it to be validated later
188            return self
189
190        output_score_keys = [score.json_key() for score in eval.output_scores]
191        if set(output_score_keys) != set(self.scores.keys()):
192            raise ValueError(
193                f"The scores produced by the evaluator must match the scores expected by the eval. Got: [{', '.join(self.scores.keys())}] and expected: [{', '.join(output_score_keys)}]"
194            )
195
196        # Check that each score is expected in this eval and has the correct type
197        for output_score in eval.output_scores:
198            match output_score.type:
199                case TaskOutputRatingType.five_star:
200                    five_star_score = self.scores[output_score.json_key()]
201                    if (
202                        not isinstance(five_star_score, float)
203                        or five_star_score < 1.0
204                        or five_star_score > 5.0
205                    ):
206                        raise ValueError(
207                            f"Score {output_score.name} is a five_star rating and must be a float between 1.0 and 5.0 inclusive. Got: {five_star_score}"
208                        )
209                case TaskOutputRatingType.pass_fail:
210                    pass_fail_score = self.scores[output_score.json_key()]
211                    if (
212                        not isinstance(pass_fail_score, float)
213                        or pass_fail_score < 0.0
214                        or pass_fail_score > 1.0
215                    ):
216                        raise ValueError(
217                            f"Score {output_score.name} is a pass_fail rating and must be a float between 0.0 and 1.0 inclusive. Got: {pass_fail_score}"
218                        )
219                case TaskOutputRatingType.pass_fail_critical:
220                    pass_fail_critical_score = self.scores[output_score.json_key()]
221                    if (
222                        not isinstance(pass_fail_critical_score, float)
223                        or pass_fail_critical_score < -1.0
224                        or pass_fail_critical_score > 1.0
225                    ):
226                        raise ValueError(
227                            f"Score {output_score.name} is a pass_fail_critical rating and must be a float between -1.0 and 1.0 inclusive. Got: {pass_fail_critical_score}"
228                        )
229                case TaskOutputRatingType.custom:
230                    raise ValueError(
231                        f"Custom scores are not supported in evaluators. '{output_score.name}' was set to a custom score."
232                    )
233                case _:
234                    # Catch missing cases
235                    raise_exhaustive_enum_error(output_score.type)
236        return self
237
238    @model_validator(mode="after")
239    def validate_reference_answer(self) -> Self:
240        parent_eval_config = self.parent_eval_config()
241        parent_eval = parent_eval_config.parent_eval() if parent_eval_config else None
242        if not parent_eval:
243            # Can't validate without the grand-parent eval, allow it to be validated later
244            return self
245
246        evaluation_data_type = parent_eval.evaluation_data_type
247        if (
248            self.reference_answer is not None
249            and evaluation_data_type != EvalDataType.reference_answer
250        ):
251            raise ValueError(
252                f"reference_answer is only valid for reference answer evals. Got: {evaluation_data_type.value}"
253            )
254        return self
255
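A hedged sketch (illustrative, not from the module itself) of a task-run style EvalRun built standalone, without a parent EvalConfig, so the grand-parent score/type validation is deferred until the parent is set; all IDs and values are placeholders.

from kiln_ai.datamodel.eval import EvalRun

run = EvalRun(
    dataset_id="111111111111",          # placeholder dataset item ID
    task_run_config_id="222222222222",  # required because eval_config_eval defaults to False
    input='{"question": "What is 2 + 2?"}',
    output='{"answer": "4"}',
    scores={"overall_rating": 4.0},     # keys must match the parent Eval's output score json_keys
)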
256
257class EvalConfig(KilnParentedModel, KilnParentModel, parent_of={"runs": EvalRun}):
258    """
259    A configuration for running an eval. This includes anything needed to run the eval on a dataset like the prompt, model, thresholds, etc.
260
261    An eval might have many configs, for example running the same eval with 2 different models. Comparing eval results is only valid within the scope of the same config.
262    """
263
264    name: FilenameString = Field(description="The name of the eval config.")
265    model_name: str = Field(
266        description="The name of the model to use for this eval config.",
267    )
268    model_provider: str = Field(
269        description="The provider of the model to use for this eval config.",
270    )
271    config_type: EvalConfigType = Field(
272        default=EvalConfigType.g_eval,
273        description="This is used to determine the type of eval to run.",
274    )
275    properties: dict[str, Any] = Field(
276        default={},
277        description="Properties to be used to execute the eval config. This is config_type specific and should serialize to a json dict.",
278    )
279
280    def parent_eval(self) -> Union["Eval", None]:
281        if self.parent is not None and self.parent.__class__.__name__ != "Eval":
282            raise ValueError("parent must be an Eval")
283        return self.parent  # type: ignore
284
285    def runs(self, readonly: bool = False) -> list[EvalRun]:
286        return super().runs(readonly=readonly)  # type: ignore
287
288    @model_validator(mode="after")
289    def validate_properties(self) -> Self:
290        if (
291            self.config_type == EvalConfigType.g_eval
292            or self.config_type == EvalConfigType.llm_as_judge
293        ):
294            if "eval_steps" not in self.properties or not isinstance(
295                self.properties["eval_steps"], list
296            ):
297                raise ValueError("eval_steps is required and must be a list for g_eval and llm_as_judge configs")
298            if "task_description" in self.properties and not isinstance(
299                self.properties["task_description"], str
300            ):
301                raise ValueError(
302                    "task_description is optional, but if provided must be a string"
303                )
304            return self
305        else:
306            raise ValueError(f"Invalid eval config type: {self.config_type}")
307
308    @model_validator(mode="after")
309    def validate_json_serializable(self) -> "EvalConfig":
310        try:
311            # This will raise a TypeError if the dict contains non-JSON-serializable objects
312            json.dumps(self.properties)
313        except TypeError as e:
314            raise ValueError(f"Properties must be JSON serializable: {e!s}")
315        return self
316
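A sketch (illustrative, not from the module itself) of a g_eval judge config. The name, model_name, and model_provider values are placeholders; the only structural requirement validated here is that properties contains an eval_steps list, plus an optional task_description string.

from kiln_ai.datamodel.eval import EvalConfig, EvalConfigType

judge_config = EvalConfig(
    name="Example Judge",        # placeholder name
    model_name="gpt_4o",         # placeholder model name
    model_provider="openai",     # placeholder provider name
    config_type=EvalConfigType.g_eval,
    properties={
        "eval_steps": [
            "Does the answer address the question?",
            "Is the answer factually correct?",
        ],
        "task_description": "Answer math questions concisely.",  # optional
    },
)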
317
318class EvalDataType(str, Enum):
319    final_answer = "final_answer"
320    full_trace = "full_trace"
321    reference_answer = "reference_answer"
322
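An explanatory summary (illustrative only, not API surface) of what each evaluation data type implies for an EvalRun, per the EvalRun validators above.

from kiln_ai.datamodel.eval import EvalDataType

# Derived from EvalRun.validate_output_fields and validate_reference_answer.
RUN_SHAPE_NOTES = {
    EvalDataType.final_answer: "task_run_trace must be None",
    EvalDataType.full_trace: "task-run eval runs (eval_config_eval=False) must set task_run_trace",
    EvalDataType.reference_answer: "EvalRun.reference_answer may be set (and only for this data type)",
}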
323
324class Eval(KilnParentedModel, KilnParentModel, parent_of={"configs": EvalConfig}):
325    name: FilenameString = Field(description="The name of the eval.")
326    description: str | None = Field(
327        default=None, description="The description of the eval"
328    )
329    template: EvalTemplateId | None = Field(
330        default=None,
331        description="The template selected when creating this eval. Useful for suggesting eval steps and output scores.",
332    )
333    current_config_id: ID_TYPE = Field(
334        default=None,
335        description="The id of the current config to use for this eval. This can be changed over time to run the same eval with different configs.",
336    )
337    eval_set_filter_id: DatasetFilterId = Field(
338        description="The id of the dataset filter which defines which dataset items are included when running this eval. Should be mutually exclusive with eval_configs_filter_id and train_set_filter_id."
339    )
340    eval_configs_filter_id: DatasetFilterId | None = Field(
341        default=None,
342        description="The id of the dataset filter which defines which dataset items are included when comparing the quality of the eval configs under this eval. Should consist of dataset items with ratings. Should be mutually exclusive with eval_set_filter_id.",
343    )
344    train_set_filter_id: DatasetFilterId | None = Field(
345        default=None,
346        description="The id of the dataset filter which defines which dataset items are included in the training set for fine-tuning. Should be mutually exclusive with eval_set_filter_id.",
347    )
348    output_scores: List[EvalOutputScore] = Field(
349        description="The scores this evaluator should produce."
350    )
351    favourite: bool = Field(
352        default=False,
353        description="Whether this eval is a favourite of the user. Rendered as a star icon in the UI.",
354    )
355    template_properties: dict[str, str | int | bool | float] | None = Field(
356        default=None,
357        description="Properties to be used to execute the eval. This is template specific and should serialize to a json dict.",
358    )
359    evaluation_data_type: EvalDataType = Field(
360        default=EvalDataType.final_answer,
361        description="The output of the task run to evaluate. Can be final answer, full trace, or reference answer.",
362    )
363
364    # Workaround to return typed parent without importing Task
365    def parent_task(self) -> Union["Task", None]:
366        if self.parent is not None and self.parent.__class__.__name__ != "Task":
367            raise ValueError("parent must be a Task")
368        return self.parent  # type: ignore
369
370    def configs(self, readonly: bool = False) -> list[EvalConfig]:
371        return super().configs(readonly=readonly)  # type: ignore
372
373    # Workaround to return typed parent without importing Spec
374    def associated_spec(self, readonly: bool = False) -> Union["Spec", None]:
375        """
376        Get the spec associated with this eval, if any.
377        Returns None for legacy evals that are not associated with a spec.
378        """
379
380        task = self.parent_task()
381        if not task or not self.id:
382            return None
383
384        specs = task.specs(readonly=readonly)
385        for spec in specs:
386            if spec.eval_id == self.id:
387                return spec
388        return None
389
390    @model_validator(mode="after")
391    def upgrade_old_reference_answer_eval_config(self) -> Self:
392        """
393        Migration: Set the first judge config as the default for existing reference answer evals that don't have a current_config_id set.
394
395        For reference_answer evals that don't have a current_config_id set, this migration
396        will set the first config (by created_at) as the default.
397        """
398        if self.id is None:
399            return self
400
401        # Only run during file loading
402        if not self._loaded_from_file:
403            return self
404
405        # Skip if already migrated (has a current_config_id set)
406        if self.current_config_id is not None:
407            return self
408
409        # Only migrate reference_answer evals
410        if self.evaluation_data_type != EvalDataType.reference_answer:
411            return self
412
413        # Prevent recursion: self.configs() loads child files, which re-loads this parent
414        # (see basemodel.py where we iterate_children_paths_of_parent_path calls load_from_file)
415        # This causes the validator to run again, creating an infinite loop without this guard.
416        with _migration_lock:
417            if self.id in _currently_migrating_eval_ids:
418                return self
419            _currently_migrating_eval_ids.add(self.id)
420
421        try:
422            # Get the configs - these are loaded from child files
423            configs_list = self.configs(readonly=True)
424            if configs_list and len(configs_list) > 0:
425                # Sort by created_at to get the oldest (first created) config
426                sorted_configs = sorted(configs_list, key=lambda c: c.created_at)
427                self.current_config_id = sorted_configs[0].id
428        finally:
429            with _migration_lock:
430                _currently_migrating_eval_ids.discard(self.id)
431
432        return self
433
434    @model_validator(mode="after")
435    def validate_scores(self) -> Self:
436        if self.output_scores is None or len(self.output_scores) == 0:
437            raise ValueError(
438                "output_scores are required, and must have at least one score."
439            )
440
441        # check for duplicate names (once transformed to JSON keys)
442        output_score_keys = [score.json_key() for score in self.output_scores]
443        if len(output_score_keys) != len(set(output_score_keys)):
444            raise ValueError(
445                f"output_scores must have unique names (once transformed to JSON keys). Got: [{', '.join(output_score_keys)}]"
446            )
447        return self
448
449    @model_validator(mode="after")
450    def validate_template_properties(self) -> Self:
451        # eval_configs_filter_id is required for all templates except "rag"
452        if (
453            self.template is not EvalTemplateId.rag
454            and self.eval_configs_filter_id is None
455        ):
456            raise ValueError(
457                "eval_configs_filter_id is required for all templates except 'rag'"
458            )
459
460        # For spec-based evals, template_properties will be None and validation happens in the spec
461        # For legacy evals, template_properties contains the data and we validate here
462        if self.template_properties is None:
463            return self
464
465        # Check for properties that are required for the issue template (legacy evals only)
466        if self.template == EvalTemplateId.issue:
467            if "issue_prompt" not in self.template_properties or not isinstance(
468                self.template_properties["issue_prompt"], str
469            ):
470                raise ValueError("issue_prompt is required for issue template")
471            if "failure_example" in self.template_properties and not isinstance(
472                self.template_properties["failure_example"], str
473            ):
474                raise ValueError(
475                    "failure_example is optional for issue template, but if provided must be a string"
476                )
477            if "pass_example" in self.template_properties and not isinstance(
478                self.template_properties["pass_example"], str
479            ):
480                raise ValueError(
481                    "pass_example is optional for issue template, but if provided must be a string"
482                )
483
484        if self.template == EvalTemplateId.tool_call:
485            if self.evaluation_data_type != EvalDataType.full_trace:
486                raise ValueError(
487                    "tool_call template should have evaluation_data_type set to full_trace"
488                )
489            if (
490                "tool" not in self.template_properties
491                or not isinstance(self.template_properties["tool"], str)
492                or not self.template_properties["tool"].strip()
493            ):
494                raise ValueError("tool is required for tool call template")
495            if "tool_function_name" not in self.template_properties or not isinstance(
496                self.template_properties["tool_function_name"], str
497            ):
498                raise ValueError(
499                    "tool_function_name is required for tool call template"
500                )
501            if (
502                "appropriate_tool_use_guidelines" not in self.template_properties
503                or not isinstance(
504                    self.template_properties["appropriate_tool_use_guidelines"], str
505                )
506                or not self.template_properties[
507                    "appropriate_tool_use_guidelines"
508                ].strip()
509            ):
510                raise ValueError(
511                    "appropriate_tool_use_guidelines is required for tool call template"
512                )
513            if (
514                "inappropriate_tool_use_guidelines" in self.template_properties
515                and not isinstance(
516                    self.template_properties["inappropriate_tool_use_guidelines"], str
517                )
518            ):
519                raise ValueError(
520                    "inappropriate_tool_use_guidelines is optional for tool call template, but if provided must be a string"
521                )
522        return self
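A sketch (illustrative, not from the module itself) tying the pieces together: an Eval with one output score and dataset filters. The tag-style filter ids are assumptions for illustration; eval_configs_filter_id is supplied because the validator above requires it for every template except 'rag', including template=None.

from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
from kiln_ai.datamodel.eval import Eval, EvalDataType, EvalOutputScore

quality_eval = Eval(
    name="Answer Quality",                 # placeholder name
    description="Judges overall answer quality.",
    eval_set_filter_id="tag::eval_set",    # assumed tag-style DatasetFilterId; use a filter valid in your project
    eval_configs_filter_id="tag::golden",  # assumed tag-style DatasetFilterId
    output_scores=[
        EvalOutputScore(name="Overall Rating", type=TaskOutputRatingType.five_star),
    ],
    evaluation_data_type=EvalDataType.final_answer,
)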
417        with _migration_lock:
418            if self.id in _currently_migrating_eval_ids:
419                return self
420            _currently_migrating_eval_ids.add(self.id)
421
422        try:
423            # Get the configs - these are loaded from child files
424            configs_list = self.configs(readonly=True)
425            if configs_list and len(configs_list) > 0:
426                # Sort by created_at to get the oldest (first created) config
427                sorted_configs = sorted(configs_list, key=lambda c: c.created_at)
428                self.current_config_id = sorted_configs[0].id
429        finally:
430            with _migration_lock:
431                _currently_migrating_eval_ids.discard(self.id)
432
433        return self
434
435    @model_validator(mode="after")
436    def validate_scores(self) -> Self:
437        if self.output_scores is None or len(self.output_scores) == 0:
438            raise ValueError(
439                "output_scores are required, and must have at least one score."
440            )
441
442        # check for duplicate names (once transformed to JSON keys)
443        output_score_keys = [score.json_key() for score in self.output_scores]
444        if len(output_score_keys) != len(set(output_score_keys)):
445            raise ValueError(
446                f"output_scores must have unique names (once transformed to JSON keys). Got: [{', '.join(output_score_keys)}]"
447            )
448        return self
449
450    @model_validator(mode="after")
451    def validate_template_properties(self) -> Self:
452        # eval_configs_filter_id is required for all templates except "rag"
453        if (
454            self.template is not EvalTemplateId.rag
455            and self.eval_configs_filter_id is None
456        ):
457            raise ValueError(
458                "eval_configs_filter_id is required for all templates except 'rag'"
459            )
460
461        # For spec-based evals, template_properties will be None and validation happens in the spec
462        # For legacy evals, template_properties contains the data and we validate here
463        if self.template_properties is None:
464            return self
465
466        # Check for properties that are required for the issue template (legacy evals only)
467        if self.template == EvalTemplateId.issue:
468            if "issue_prompt" not in self.template_properties or not isinstance(
469                self.template_properties["issue_prompt"], str
470            ):
471                raise ValueError("issue_prompt is required for issue template")
472            if "failure_example" in self.template_properties and not isinstance(
473                self.template_properties["failure_example"], str
474            ):
475                raise ValueError(
476                    "failure_example is optional for issue template, but if provided must be a string"
477                )
478            if "pass_example" in self.template_properties and not isinstance(
479                self.template_properties["pass_example"], str
480            ):
481                raise ValueError(
482                    "pass_example is optional for issue template, but if provided must be a string"
483                )
484
485        if self.template == EvalTemplateId.tool_call:
486            if self.evaluation_data_type != EvalDataType.full_trace:
487                raise ValueError(
488                    "tool_call template should have evaluation_data_type set to full_trace"
489                )
490            if (
491                "tool" not in self.template_properties
492                or not isinstance(self.template_properties["tool"], str)
493                or not self.template_properties["tool"].strip()
494            ):
495                raise ValueError("tool is required for tool call template")
496            if "tool_function_name" not in self.template_properties or not isinstance(
497                self.template_properties["tool_function_name"], str
498            ):
499                raise ValueError(
500                    "tool_function_name is required for tool call template"
501                )
502            if (
503                "appropriate_tool_use_guidelines" not in self.template_properties
504                or not isinstance(
505                    self.template_properties["appropriate_tool_use_guidelines"], str
506                )
507                or not self.template_properties[
508                    "appropriate_tool_use_guidelines"
509                ].strip()
510            ):
511                raise ValueError(
512                    "appropriate_tool_use_guidelines is required for tool call template"
513                )
514            if (
515                "inappropriate_tool_use_guidelines" in self.template_properties
516                and not isinstance(
517                    self.template_properties["inappropriate_tool_use_guidelines"], str
518                )
519            ):
520                raise ValueError(
521                    "inappropriate_tool_use_guidelines is optional for tool call template, but if provided must be a string"
522                )
523        return self

Base model for Kiln models that have a parent-child relationship. This base class is for child models.

This class provides functionality for managing hierarchical relationships between models, including parent reference handling and file system organization.

Attributes:
    parent (KilnBaseModel): Reference to the parent model instance. Not persisted, just in memory.

name: FilenameString
description: str | None
template: EvalTemplateId | None
current_config_id: Optional[str]
eval_set_filter_id: DatasetFilterId
eval_configs_filter_id: Optional[DatasetFilterId]
train_set_filter_id: Optional[DatasetFilterId]
output_scores: List[EvalOutputScore]
favourite: bool
template_properties: dict[str, str | int | bool | float] | None
evaluation_data_type: EvalDataType
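
A minimal construction sketch. The filter IDs below ('tag::eval_set', 'tag::golden') are placeholders rather than values guaranteed by the dataset_filters module, and the five_star member is taken from the rating types named in EvalOutputScore's type description:

    from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
    from kiln_ai.datamodel.eval import Eval, EvalDataType, EvalOutputScore

    eval_definition = Eval(
        name="Answer Quality",
        description="Scores overall answer quality for the support task.",
        eval_set_filter_id="tag::eval_set",      # placeholder DatasetFilterId
        eval_configs_filter_id="tag::golden",    # required unless template is 'rag'
        output_scores=[
            EvalOutputScore(
                name="Overall Rating",
                instruction="Rate the overall quality of the answer.",
                type=TaskOutputRatingType.five_star,
            ),
        ],
        evaluation_data_type=EvalDataType.final_answer,  # the default
    )

Passing a custom score type, or two scores that collapse to the same JSON key, would fail the validators shown below.
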
def parent_task(self) -> Optional[kiln_ai.datamodel.Task]:
366    def parent_task(self) -> Union["Task", None]:
367        if self.parent is not None and self.parent.__class__.__name__ != "Task":
368            raise ValueError("parent must be a Task")
369        return self.parent  # type: ignore
def configs(self, readonly=False) -> List[EvalConfig]:
709        def child_method(self, readonly: bool = False) -> list[child_class]:  # type: ignore[invalid-type-form]
710            return child_class.all_children_of_parent_path(self.path, readonly=readonly)

Returns all EvalConfig children of this eval, loaded from the files under this eval's path.

def associated_spec(self, readonly: bool = False) -> Optional[kiln_ai.datamodel.spec.Spec]:
375    def associated_spec(self, readonly: bool = False) -> Union["Spec", None]:
376        """
377        Get the spec associated with this eval, if any.
378        Returns None for legacy evals that are not associated with a spec.
379        """
380
381        task = self.parent_task()
382        if not task or not self.id:
383            return None
384
385        specs = task.specs(readonly=readonly)
386        for spec in specs:
387            if spec.eval_id == self.id:
388                return spec
389        return None

Get the spec associated with this eval, if any. Returns None for legacy evals that are not associated with a spec.
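
A short usage sketch, assuming eval_definition (from the construction sketch above) was loaded with its parent Task attached; both helpers are read-only lookups:

    # Resolve the owning Task; raises ValueError if the parent is some other model type.
    task = eval_definition.parent_task()

    # Find the Spec whose eval_id points at this eval, or None for legacy evals.
    spec = eval_definition.associated_spec(readonly=True)
    if spec is None:
        print("Legacy eval: no spec references this eval.")
    else:
        print(f"Eval {eval_definition.id} is defined by spec {spec.id}")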

@model_validator(mode='after')
def upgrade_old_reference_answer_eval_config(self) -> Self:
391    @model_validator(mode="after")
392    def upgrade_old_reference_answer_eval_config(self) -> Self:
393        """
394        Migration: Set the first judge config as the default for existing reference answer evals that don't have a current_config_id set.
395
396        For reference_answer evals that don't have a current_config_id set, this migration
397        will set the first config (by created_at) as the default.
398        """
399        if self.id is None:
400            return self
401
402        # Only run during file loading
403        if not self._loaded_from_file:
404            return self
405
406        # Skip if already migrated (has a current_config_id set)
407        if self.current_config_id is not None:
408            return self
409
410        # Only migrate reference_answer evals
411        if self.evaluation_data_type != EvalDataType.reference_answer:
412            return self
413
414        # Prevent recursion: self.configs() loads child files, which re-loads this parent
415        # (see basemodel.py where we iterate_children_paths_of_parent_path calls load_from_file)
416        # This causes the validator to run again, creating an infinite loop without this guard.
417        with _migration_lock:
418            if self.id in _currently_migrating_eval_ids:
419                return self
420            _currently_migrating_eval_ids.add(self.id)
421
422        try:
423            # Get the configs - these are loaded from child files
424            configs_list = self.configs(readonly=True)
425            if configs_list and len(configs_list) > 0:
426                # Sort by created_at to get the oldest (first created) config
427                sorted_configs = sorted(configs_list, key=lambda c: c.created_at)
428                self.current_config_id = sorted_configs[0].id
429        finally:
430            with _migration_lock:
431                _currently_migrating_eval_ids.discard(self.id)
432
433        return self

Migration: Set the first judge config as the default for existing reference answer evals that don't have a current_config_id set.

For reference_answer evals that don't have a current_config_id set, this migration will set the first config (by created_at) as the default.
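
The migration only fires while a file is being loaded, so triggering it is a matter of reading an old eval from disk. A sketch, assuming load_from_file (the loader referenced in the recursion comment above) is the entry point and that the path shown is hypothetical:

    from pathlib import Path

    from kiln_ai.datamodel.eval import Eval, EvalDataType

    # Hypothetical path to an older reference_answer eval saved without a current_config_id.
    eval_file = Path("my_project/tasks/task_123/evals/eval_456/eval.kiln")

    loaded = Eval.load_from_file(eval_file)
    if loaded.evaluation_data_type == EvalDataType.reference_answer:
        # After loading, the oldest child config (by created_at) is the default judge config.
        print(loaded.current_config_id)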

@model_validator(mode='after')
def validate_scores(self) -> Self:
435    @model_validator(mode="after")
436    def validate_scores(self) -> Self:
437        if self.output_scores is None or len(self.output_scores) == 0:
438            raise ValueError(
439                "output_scores are required, and must have at least one score."
440            )
441
442        # check for duplicate names (once transformed to JSON keys)
443        output_score_keys = [score.json_key() for score in self.output_scores]
444        if len(output_score_keys) != len(set(output_score_keys)):
445            raise ValueError(
446                f"output_scores must have unique names (once transformed to JSON keys). Got: [{', '.join(output_score_keys)}]"
447            )
448        return self
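
The uniqueness check runs on json_key() values rather than raw names, so scores that differ only in casing or spacing still collide. A sketch of the failure mode, with placeholder filter IDs again; pydantic's ValidationError subclasses ValueError, so the except clause catches it:

    from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
    from kiln_ai.datamodel.eval import Eval, EvalOutputScore

    try:
        Eval(
            name="Duplicate Score Demo",
            eval_set_filter_id="tag::eval_set",
            eval_configs_filter_id="tag::golden",
            output_scores=[
                EvalOutputScore(name="Overall Rating", type=TaskOutputRatingType.five_star),
                # "overall rating" normalizes to the same JSON key as "Overall Rating".
                EvalOutputScore(name="overall rating", type=TaskOutputRatingType.pass_fail),
            ],
        )
    except ValueError as err:
        print(err)  # ... output_scores must have unique names (once transformed to JSON keys) ...
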
@model_validator(mode='after')
def validate_template_properties(self) -> Self:
450    @model_validator(mode="after")
451    def validate_template_properties(self) -> Self:
452        # eval_configs_filter_id is required for all templates except "rag"
453        if (
454            self.template is not EvalTemplateId.rag
455            and self.eval_configs_filter_id is None
456        ):
457            raise ValueError(
458                "eval_configs_filter_id is required for all templates except 'rag'"
459            )
460
461        # For spec-based evals, template_properties will be None and validation happens in the spec
462        # For legacy evals, template_properties contains the data and we validate here
463        if self.template_properties is None:
464            return self
465
466        # Check for properties that are required for the issue template (legacy evals only)
467        if self.template == EvalTemplateId.issue:
468            if "issue_prompt" not in self.template_properties or not isinstance(
469                self.template_properties["issue_prompt"], str
470            ):
471                raise ValueError("issue_prompt is required for issue template")
472            if "failure_example" in self.template_properties and not isinstance(
473                self.template_properties["failure_example"], str
474            ):
475                raise ValueError(
476                    "failure_example is optional for issue template, but if provided must be a string"
477                )
478            if "pass_example" in self.template_properties and not isinstance(
479                self.template_properties["pass_example"], str
480            ):
481                raise ValueError(
482                    "pass_example is optional for issue template, but if provided must be a string"
483                )
484
485        if self.template == EvalTemplateId.tool_call:
486            if self.evaluation_data_type != EvalDataType.full_trace:
487                raise ValueError(
488                    "tool_call template should have evaluation_data_type set to full_trace"
489                )
490            if (
491                "tool" not in self.template_properties
492                or not isinstance(self.template_properties["tool"], str)
493                or not self.template_properties["tool"].strip()
494            ):
495                raise ValueError("tool is required for tool call template")
496            if "tool_function_name" not in self.template_properties or not isinstance(
497                self.template_properties["tool_function_name"], str
498            ):
499                raise ValueError(
500                    "tool_function_name is required for tool call template"
501                )
502            if (
503                "appropriate_tool_use_guidelines" not in self.template_properties
504                or not isinstance(
505                    self.template_properties["appropriate_tool_use_guidelines"], str
506                )
507                or not self.template_properties[
508                    "appropriate_tool_use_guidelines"
509                ].strip()
510            ):
511                raise ValueError(
512                    "appropriate_tool_use_guidelines is required for tool call template"
513                )
514            if (
515                "inappropriate_tool_use_guidelines" in self.template_properties
516                and not isinstance(
517                    self.template_properties["inappropriate_tool_use_guidelines"], str
518                )
519            ):
520                raise ValueError(
521                    "inappropriate_tool_use_guidelines is optional for tool call template, but if provided must be a string"
522                )
523        return self
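
For the tool_call template the checks above enumerate the required keys, so a configuration that passes looks roughly like this (tool names, guideline text, and filter IDs are all placeholders):

    from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
    from kiln_ai.datamodel.eval import Eval, EvalDataType, EvalOutputScore, EvalTemplateId

    tool_call_eval = Eval(
        name="Weather Tool Usage",
        template=EvalTemplateId.tool_call,
        evaluation_data_type=EvalDataType.full_trace,  # tool_call requires the full trace
        eval_set_filter_id="tag::eval_set",
        eval_configs_filter_id="tag::golden",
        output_scores=[
            EvalOutputScore(name="Tool Use", type=TaskOutputRatingType.pass_fail),
        ],
        template_properties={
            "tool": "Weather Lookup",
            "tool_function_name": "get_weather",
            "appropriate_tool_use_guidelines": "Call the tool whenever the user asks about current weather.",
            # Optional, but must be a string when present:
            "inappropriate_tool_use_guidelines": "Do not call the tool for historical climate questions.",
        },
    )
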
def relationship_name() -> str:
727        def relationship_name_method() -> str:
728            return relationship_name

Returns the name of the child relationship, as configured by the parent_of mapping on the parent class.

def parent_type() -> Type[kiln_ai.datamodel.basemodel.KilnParentModel]:
720        def parent_class_method() -> Type[KilnParentModel]:
721            return cls

Returns the parent model class that this child type belongs to.

model_config = {'validate_assignment': True}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

def model_post_init(self: pydantic.main.BaseModel, context: Any, /) -> None:
337def init_private_attributes(self: BaseModel, context: Any, /) -> None:
338    """This function is meant to behave like a BaseModel method to initialise private attributes.
339
340    It takes context as an argument since that's what pydantic-core passes when calling it.
341
342    Args:
343        self: The BaseModel instance.
344        context: The context.
345    """
346    if getattr(self, '__pydantic_private__', None) is None:
347        pydantic_private = {}
348        for name, private_attr in self.__private_attributes__.items():
349            default = private_attr.get_default()
350            if default is not PydanticUndefined:
351                pydantic_private[name] = default
352        object_setattr(self, '__pydantic_private__', pydantic_private)

This function is meant to behave like a BaseModel method to initialise private attributes.

It takes context as an argument since that's what pydantic-core passes when calling it.

Args:
    self: The BaseModel instance.
    context: The context.