kiln_ai.datamodel.eval

  1import json
  2from enum import Enum
  3from threading import Lock
  4from typing import TYPE_CHECKING, Any, Dict, List, Union
  5
  6from pydantic import BaseModel, Field, model_validator
  7from typing_extensions import Self
  8
  9from kiln_ai.datamodel.basemodel import (
 10    ID_TYPE,
 11    FilenameString,
 12    KilnParentedModel,
 13    KilnParentModel,
 14)
 15from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
 16from kiln_ai.datamodel.dataset_filters import DatasetFilterId
 17from kiln_ai.datamodel.json_schema import string_to_json_key
 18from kiln_ai.datamodel.task_run import Usage
 19from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error
 20
 21if TYPE_CHECKING:
 22    from kiln_ai.datamodel.task import Task
 23
 24EvalScores = Dict[str, float]
 25
 26# Module-level set to track evals currently being migrated (to prevent recursion)
 27# Protected by _migration_lock to ensure thread-safe access
 28_migration_lock = Lock()
 29_currently_migrating_eval_ids: set[ID_TYPE] = set()
 30
 31
 32class EvalTemplateId(str, Enum):
 33    """
 34    An eval template is a pre-defined eval that can be used as a starting point for a new eval.
 35    """
 36
 37    kiln_requirements = "kiln_requirements"
 38    issue = "kiln_issue"
 39    tool_call = "tool_call"
 40    toxicity = "toxicity"
 41    bias = "bias"
 42    maliciousness = "maliciousness"
 43    factual_correctness = "factual_correctness"
 44    jailbreak = "jailbreak"
 45    rag = "rag"
 46
 47
 48class EvalConfigType(str, Enum):
 49    g_eval = "g_eval"
 50    llm_as_judge = "llm_as_judge"
 51
 52
 53class EvalOutputScore(BaseModel):
 54    """
 55    A definition of a score that an evaluator will produce.
 56
 57    Very similar to TaskRequirement, but conceptually different, so it is kept as a separate model.
 58    """
 59
 60    name: str = Field(
 61        description="The name of the score. Will be provided to the model, so use a descriptive name. Should align with the task's TaskRequirement name if you want to use human evals to evaluate the evaluator's performance."
 62    )
 63    instruction: str | None = Field(
 64        default=None,
 65        description="A description of the score, used to help the model understand the goal of the score. Will be provided to evaluator models, so should be written for the model, not the team/user.",
 66    )
 67    type: TaskOutputRatingType = Field(
 68        description="The type of rating to use ('five_star', 'pass_fail', 'pass_fail_critical')."
 69    )
 70
 71    def json_key(self) -> str:
 72        """
 73        The JSON key for the score, used when running the evaluator with an LLM and JSON output is needed.
 74
 75        For example, "Overall Rating" -> "overall_rating"
 76        """
 77        return string_to_json_key(self.name)
 78
 79    @model_validator(mode="after")
 80    def validate_type(self) -> Self:
 81        if self.type == TaskOutputRatingType.custom:
 82            raise ValueError(
 83                f"Custom scores are not supported in evaluators. Score '{self.name}' was set to a custom score."
 84            )
 85        return self
 86
 87
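# --- Editorial usage sketch (illustration, not part of the original module) ---
# Shows how an EvalOutputScore maps its display name to the JSON key the
# evaluator model must emit. The name and instruction are made-up values.
def _example_output_score() -> None:
    overall = EvalOutputScore(
        name="Overall Rating",
        instruction="Rate the overall quality of the answer.",
        type=TaskOutputRatingType.five_star,
    )
    # Per json_key()'s docstring, "Overall Rating" becomes "overall_rating".
    assert overall.json_key() == "overall_rating"
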
 88class EvalRun(KilnParentedModel):
 89    """
 90    The results of running an eval on a single dataset item.
 91
 92    This is a child of an EvalConfig, which specifies how the scores were generated.
 93
 94    Eval runs can be one of 2 types:
 95    1) eval_config_eval=False: evaluating a task run (a method of running the task). We get the task input from the dataset item referenced by dataset_id, run the task with the task_run_config, then run the evaluator on that output. task_run_config_id must be set. The output saved in this model is the output of the task run.
 96    2) eval_config_eval=True: evaluating an eval config (a method of evaluating the task). We use the existing dataset item's input/output and run the evaluator on it. task_run_config_id must be None. The input/output saved in this model is the input/output of the dataset item.
 97    """
 98
 99    dataset_id: ID_TYPE = Field(
100        description="The ID of the dataset item that was used for this run. Must belong to the same Task as the grand-parent eval of this EvalRun."
101    )
102    task_run_config_id: ID_TYPE | None = Field(
103        description="The ID of the TaskRunConfig that was run, if this eval run was based on a task run. Must belong to the same Task as this eval. Can be None if this eval run is based on an eval config."
104    )
105    eval_config_eval: bool = Field(
106        description="Whether this eval run is evaluating the parent eval config (evaluating the config using an existing dataset item). If true, task_run_config_id must be None, as we're not running the task.",
107        default=False,
108    )
109    # These two may duplicate the dataset_id.input/output, but we're denormalizing intentionally.
110    input: str = Field(
111        description="The input to the task. JSON formatted for structured input, plaintext for unstructured input."
112    )
113    output: str = Field(
114        description="The output of the task. JSON formatted for structured output, plaintext for unstructured output."
115    )
116    reference_answer: str | None = Field(
117        default=None,
118        description="The reference answer for the input. JSON formatted for structured reference answer, plaintext for unstructured reference answer. Used for reference answer evals.",
119    )
120    intermediate_outputs: Dict[str, str] | None = Field(
121        default=None,
122        description="The intermediate outputs of the task (for example, eval thinking).",
123    )
124    task_run_trace: str | None = Field(
125        default=None,
126        description="The JSON formatted trace of the task run that produced the output.",
127    )
128    scores: EvalScores = Field(
129        description="The output scores of the evaluator (aligning to those required by the grand-parent Eval this object is a child of)."
130    )
131    task_run_usage: Usage | None = Field(
132        default=None,
133        description="The usage of the task run that produced this eval run output (not the usage by the evaluation model).",
134    )
135
136    def parent_eval_config(self) -> Union["EvalConfig", None]:
137        if self.parent is not None and self.parent.__class__.__name__ != "EvalConfig":
138            raise ValueError("parent must be an EvalConfig")
139        return self.parent  # type: ignore
140
141    @model_validator(mode="after")
142    def validate_output_fields(self) -> Self:
143        parent_eval_config = self.parent_eval_config()
144        parent_eval = parent_eval_config.parent_eval() if parent_eval_config else None
145        if not parent_eval:
146            return self
147
148        evaluation_data_type = parent_eval.evaluation_data_type
149        if (
150            evaluation_data_type == EvalDataType.final_answer
151            and self.task_run_trace is not None
152        ):
153            raise ValueError("final_answer runs should not set trace")
154        elif (
155            not self.eval_config_eval
156            and evaluation_data_type == EvalDataType.full_trace
157            and self.task_run_trace is None
158        ):
159            raise ValueError("full_trace task run eval runs should include trace")
160
161        return self
162
163    @model_validator(mode="after")
164    def validate_eval_run_types(self) -> Self:
165        if self.eval_config_eval and self.task_run_config_id is not None:
166            raise ValueError(
167                "task_run_config_id must be None if eval_config_eval is true"
168            )
169        if not self.eval_config_eval and self.task_run_config_id is None:
170            raise ValueError(
171                "task_run_config_id must be set if eval_config_eval is false"
172            )
173        return self
174
175    @model_validator(mode="after")
176    def validate_scores(self) -> Self:
177        # We're checking the scores have the expected keys from the grand-parent eval
178        if self.scores is None or len(self.scores) == 0:
179            raise ValueError("scores are required, and must have at least one score.")
180
181        parent_eval_config = self.parent_eval_config()
182        eval = parent_eval_config.parent_eval() if parent_eval_config else None
183        if not eval:
184            # Can't validate without the grand-parent eval, allow it to be validated later
185            return self
186
187        output_score_keys = [score.json_key() for score in eval.output_scores]
188        if set(output_score_keys) != set(self.scores.keys()):
189            raise ValueError(
190                f"The scores produced by the evaluator must match the scores expected by the eval. Got: [{', '.join(self.scores.keys())}] and expected: [{', '.join(output_score_keys)}]"
191            )
192
193        # Check that each score is expected in this eval and the correct type
194        for output_score in eval.output_scores:
195            match output_score.type:
196                case TaskOutputRatingType.five_star:
197                    five_star_score = self.scores[output_score.json_key()]
198                    if (
199                        not isinstance(five_star_score, float)
200                        or five_star_score < 1.0
201                        or five_star_score > 5.0
202                    ):
203                        raise ValueError(
204                            f"Score {output_score.name} is a five_star rating and must be a float between 1.0 and 5.0 inclusive. Got: {five_star_score}"
205                        )
206                case TaskOutputRatingType.pass_fail:
207                    pass_fail_score = self.scores[output_score.json_key()]
208                    if (
209                        not isinstance(pass_fail_score, float)
210                        or pass_fail_score < 0.0
211                        or pass_fail_score > 1.0
212                    ):
213                        raise ValueError(
214                            f"Score {output_score.name} is a pass_fail rating and must be a float between 0.0 and 1.0 inclusive. Got: {pass_fail_score}"
215                        )
216                case TaskOutputRatingType.pass_fail_critical:
217                    pass_fail_critical_score = self.scores[output_score.json_key()]
218                    if (
219                        not isinstance(pass_fail_critical_score, float)
220                        or pass_fail_critical_score < -1.0
221                        or pass_fail_critical_score > 1.0
222                    ):
223                        raise ValueError(
224                            f"Score {output_score.name} is a pass_fail_critical rating and must be a float between -1.0 and 1.0 inclusive. Got: {pass_fail_critical_score}"
225                        )
226                case TaskOutputRatingType.custom:
227                    raise ValueError(
228                        f"Custom scores are not supported in evaluators. '{output_score.name}' was set to a custom score."
229                    )
230                case _:
231                    # Catch missing cases
232                    raise_exhaustive_enum_error(output_score.type)
233        return self
234
235    @model_validator(mode="after")
236    def validate_reference_answer(self) -> Self:
237        parent_eval_config = self.parent_eval_config()
238        parent_eval = parent_eval_config.parent_eval() if parent_eval_config else None
239        if not parent_eval:
240            # Can't validate without the grand-parent eval, allow it to be validated later
241            return self
242
243        evaluation_data_type = parent_eval.evaluation_data_type
244        if (
245            self.reference_answer is not None
246            and evaluation_data_type != EvalDataType.reference_answer
247        ):
248            raise ValueError(
249                f"reference_answer is only valid for reference answer evals. Got: {evaluation_data_type.value}"
250            )
251        return self
252
253
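# --- Editorial usage sketch (illustration, not part of the original module) ---
# The two EvalRun modes described in the class docstring, constructed
# standalone so the parent-dependent validators are skipped. All IDs, inputs
# and scores are placeholder values; once attached to a real EvalConfig the
# score keys must match the grand-parent Eval's output_scores.
def _example_eval_runs() -> tuple[EvalRun, EvalRun]:
    # 1) Evaluating a task run: task_run_config_id must be set.
    task_run_eval = EvalRun(
        dataset_id="dataset-item-123",
        task_run_config_id="task-run-config-456",
        input="What is the capital of France?",
        output="Paris",
        scores={"overall_rating": 4.0},  # five_star scores must be 1.0-5.0
    )
    # 2) Evaluating an eval config against an existing dataset item:
    #    eval_config_eval=True and task_run_config_id must be None.
    config_eval = EvalRun(
        dataset_id="dataset-item-123",
        task_run_config_id=None,
        eval_config_eval=True,
        input="What is the capital of France?",
        output="Paris",
        scores={"factually_correct": 1.0},  # pass_fail scores must be 0.0-1.0
    )
    return task_run_eval, config_eval
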
254class EvalConfig(KilnParentedModel, KilnParentModel, parent_of={"runs": EvalRun}):
255    """
256    A configuration for running an eval. This includes anything needed to run the eval on a dataset like the prompt, model, thresholds, etc.
257
258    An eval might have many configs, for example running the same eval with 2 different models. Comparing eval results is only valid within the scope of the same config.
259    """
260
261    name: FilenameString = Field(description="The name of the eval config.")
262    model_name: str = Field(
263        description="The name of the model to use for this eval config. ",
264    )
265    model_provider: str = Field(
266        description="The provider of the model to use for this eval config.",
267    )
268    config_type: EvalConfigType = Field(
269        default=EvalConfigType.g_eval,
270        description="This is used to determine the type of eval to run.",
271    )
272    properties: dict[str, Any] = Field(
273        default={},
274        description="Properties to be used to execute the eval config. This is config_type specific and should serialize to a json dict.",
275    )
276
277    def parent_eval(self) -> Union["Eval", None]:
278        if self.parent is not None and self.parent.__class__.__name__ != "Eval":
279            raise ValueError("parent must be an Eval")
280        return self.parent  # type: ignore
281
282    def runs(self, readonly: bool = False) -> list[EvalRun]:
283        return super().runs(readonly=readonly)  # type: ignore
284
285    @model_validator(mode="after")
286    def validate_properties(self) -> Self:
287        if (
288            self.config_type == EvalConfigType.g_eval
289            or self.config_type == EvalConfigType.llm_as_judge
290        ):
291            if "eval_steps" not in self.properties or not isinstance(
292                self.properties["eval_steps"], list
293            ):
294                raise ValueError(
295                    "eval_steps is required and must be a list for g_eval and llm_as_judge configs"
296                )
295            if "task_description" in self.properties and not isinstance(
296                self.properties["task_description"], str
297            ):
298                raise ValueError(
299                    "task_description is optional, but if provided must be a string"
300                )
301            return self
302        else:
303            raise ValueError(f"Invalid eval config type: {self.config_type}")
304
305    @model_validator(mode="after")
306    def validate_json_serializable(self) -> "EvalConfig":
307        try:
308            # This will raise a TypeError if the dict contains non-JSON-serializable objects
309            json.dumps(self.properties)
310        except TypeError as e:
311            raise ValueError(f"Properties must be JSON serializable: {e!s}")
312        return self
313
314
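# --- Editorial usage sketch (illustration, not part of the original module) ---
# A minimal EvalConfig for the default g_eval config type. The model name,
# provider and eval steps are placeholder values; validate_properties requires
# "eval_steps" to be a list, and the properties dict must be JSON serializable.
def _example_eval_config() -> EvalConfig:
    return EvalConfig(
        name="gpt_4o_judge",
        model_name="gpt_4o",
        model_provider="openai",
        config_type=EvalConfigType.g_eval,
        properties={
            "eval_steps": [
                "Does the answer address the question?",
                "Is the answer factually correct?",
            ],
            "task_description": "Answer geography questions concisely.",
        },
    )
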
315class EvalDataType(str, Enum):
316    final_answer = "final_answer"
317    full_trace = "full_trace"
318    reference_answer = "reference_answer"
319
320
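# --- Editorial note (illustration, not part of the original module) ---
# EvalDataType controls which EvalRun fields are expected once a run is
# attached to an Eval (see EvalRun.validate_output_fields and
# EvalRun.validate_reference_answer):
#   final_answer     -> task_run_trace must be None
#   full_trace       -> task-run eval runs must include task_run_trace
#   reference_answer -> reference_answer may be set (and only for this type)
# Sketch of a full_trace-style run; the trace content is a placeholder and its
# exact JSON shape is whatever the task runner produced.
def _example_full_trace_run() -> EvalRun:
    return EvalRun(
        dataset_id="dataset-item-123",
        task_run_config_id="task-run-config-456",
        input="What is the capital of France?",
        output="Paris",
        task_run_trace='[{"role": "assistant", "content": "Paris"}]',
        scores={"tool_use": 1.0},
    )
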
321class Eval(KilnParentedModel, KilnParentModel, parent_of={"configs": EvalConfig}):
322    name: FilenameString = Field(description="The name of the eval.")
323    description: str | None = Field(
324        default=None, description="The description of the eval"
325    )
326    template: EvalTemplateId | None = Field(
327        default=None,
328        description="The template selected when creating this eval. Useful for suggesting eval steps and output scores.",
329    )
330    current_config_id: ID_TYPE = Field(
331        default=None,
332        description="The id of the current config to use for this eval. This can be changed over time to run the same eval with different configs.",
333    )
334    eval_set_filter_id: DatasetFilterId = Field(
335        description="The id of the dataset filter which defines which dataset items are included when running this eval. Should be mutually exclusive with eval_configs_filter_id."
336    )
337    eval_configs_filter_id: DatasetFilterId | None = Field(
338        default=None,
339        description="The id of the dataset filter which defines which dataset items are included when comparing the quality of the eval configs under this eval. Should consist of dataset items with ratings. Should be mutually exclusive with eval_set_filter_id.",
340    )
341    output_scores: List[EvalOutputScore] = Field(
342        description="The scores this evaluator should produce."
343    )
344    favourite: bool = Field(
345        default=False,
346        description="Whether this eval is a favourite of the user. Rendered as a star icon in the UI.",
347    )
348    template_properties: dict[str, str | int | bool | float] = Field(
349        default={},
350        description="Properties to be used to execute the eval. This is template specific and should serialize to a json dict.",
351    )
352    evaluation_data_type: EvalDataType = Field(
353        default=EvalDataType.final_answer,
354        description="The data from the task run to evaluate. Can be the final answer, the full trace, or the final answer judged against a reference answer.",
355    )
356
357    # Workaround to return typed parent without importing Task
358    def parent_task(self) -> Union["Task", None]:
359        if self.parent is not None and self.parent.__class__.__name__ != "Task":
360            raise ValueError("parent must be a Task")
361        return self.parent  # type: ignore
362
363    def configs(self, readonly: bool = False) -> list[EvalConfig]:
364        return super().configs(readonly=readonly)  # type: ignore
365
366    @model_validator(mode="after")
367    def upgrade_old_reference_answer_eval_config(self) -> Self:
368        """
369        Migration: Set the first judge config as the default for existing reference answer evals that don't have a current_config_id set.
370
371        For reference_answer evals that don't have a current_config_id set, this migration
372        will set the first config (by created_at) as the default.
373        """
374        if self.id is None:
375            return self
376
377        # Only run during file loading
378        if not self._loaded_from_file:
379            return self
380
381        # Skip if already migrated (has a current_config_id set)
382        if self.current_config_id is not None:
383            return self
384
385        # Only migrate reference_answer evals
386        if self.evaluation_data_type != EvalDataType.reference_answer:
387            return self
388
389        # Prevent recursion: self.configs() loads child files, which re-loads this parent
390        # (see basemodel.py where we iterate_children_paths_of_parent_path calls load_from_file)
391        # This causes the validator to run again, creating an infinite loop without this guard.
392        with _migration_lock:
393            if self.id in _currently_migrating_eval_ids:
394                return self
395            _currently_migrating_eval_ids.add(self.id)
396
397        try:
398            # Get the configs - these are loaded from child files
399            configs_list = self.configs(readonly=True)
400            if configs_list and len(configs_list) > 0:
401                # Sort by created_at to get the oldest (first created) config
402                sorted_configs = sorted(configs_list, key=lambda c: c.created_at)
403                self.current_config_id = sorted_configs[0].id
404        finally:
405            with _migration_lock:
406                _currently_migrating_eval_ids.discard(self.id)
407
408        return self
409
410    @model_validator(mode="after")
411    def validate_scores(self) -> Self:
412        if self.output_scores is None or len(self.output_scores) == 0:
413            raise ValueError(
414                "output_scores are required, and must have at least one score."
415            )
416
417        # check for duplicate names (once transformed to JSON keys)
418        output_score_keys = [score.json_key() for score in self.output_scores]
419        if len(output_score_keys) != len(set(output_score_keys)):
420            raise ValueError(
421                f"output_scores must have unique names (once transformed to JSON keys). Got: [{', '.join(output_score_keys)}]"
422            )
423        return self
424
425    @model_validator(mode="after")
426    def validate_template_properties(self) -> Self:
427        # eval_configs_filter_id is required for all templates except "rag"
428        if (
429            self.template is not EvalTemplateId.rag
430            and self.eval_configs_filter_id is None
431        ):
432            raise ValueError(
433                "eval_configs_filter_id is required for all templates except 'rag'"
434            )
435
436        # Check for properties that are required for the issue template
437        if self.template == EvalTemplateId.issue:
438            if "issue_prompt" not in self.template_properties or not isinstance(
439                self.template_properties["issue_prompt"], str
440            ):
441                raise ValueError("issue_prompt is required for issue template")
442            if "failure_example" in self.template_properties and not isinstance(
443                self.template_properties["failure_example"], str
444            ):
445                raise ValueError(
446                    "failure_example is optional for issue template, but if provided must be a string"
447                )
448            if "pass_example" in self.template_properties and not isinstance(
449                self.template_properties["pass_example"], str
450            ):
451                raise ValueError(
452                    "pass_example is optional for issue template, but if provided must be a string"
453                )
454
455        if self.template == EvalTemplateId.tool_call:
456            if self.evaluation_data_type != EvalDataType.full_trace:
457                raise ValueError(
458                    "tool_call template should have evaluation_data_type set to full_trace"
459                )
460            if (
461                "tool" not in self.template_properties
462                or not isinstance(self.template_properties["tool"], str)
463                or not self.template_properties["tool"].strip()
464            ):
465                raise ValueError("tool is required for tool call template")
466            if "tool_function_name" not in self.template_properties or not isinstance(
467                self.template_properties["tool_function_name"], str
468            ):
469                raise ValueError(
470                    "tool_function_name is required for tool call template"
471                )
472            if (
473                "appropriate_tool_use_guidelines" not in self.template_properties
474                or not isinstance(
475                    self.template_properties["appropriate_tool_use_guidelines"], str
476                )
477                or not self.template_properties[
478                    "appropriate_tool_use_guidelines"
479                ].strip()
480            ):
481                raise ValueError(
482                    "appropriate_tool_use_guidelines is required for tool call template"
483                )
484            if (
485                "inappropriate_tool_use_guidelines" in self.template_properties
486                and not isinstance(
487                    self.template_properties["inappropriate_tool_use_guidelines"], str
488                )
489            ):
490                raise ValueError(
491                    "inappropriate_tool_use_guidelines is optional for tool call template, but if provided must be a string"
492                )
493        return self
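
# --- Editorial usage sketch (illustration, not part of the original module) ---
# A complete Eval using the issue template, tying the pieces above together.
# The dataset filter ids are assumed example values (tag-style filters); use
# whatever DatasetFilterId values your project defines. Constructed standalone
# here; in practice an Eval is created as a child of a Task, with EvalConfigs
# and EvalRuns created as its children.
def _example_eval() -> Eval:
    issue_eval = Eval(
        name="french_answers_issue",
        description="Flags answers that are written in French instead of English.",
        template=EvalTemplateId.issue,
        template_properties={
            "issue_prompt": "The model sometimes answers in French instead of English.",
        },
        eval_set_filter_id="tag::eval_set",        # assumed example filter id
        eval_configs_filter_id="tag::golden_set",  # assumed example filter id
        output_scores=[
            EvalOutputScore(name="Issue Present", type=TaskOutputRatingType.pass_fail),
        ],
    )
    # final_answer is the default evaluation_data_type.
    assert issue_eval.evaluation_data_type is EvalDataType.final_answer
    return issue_eval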
448                )
449            if "pass_example" in self.template_properties and not isinstance(
450                self.template_properties["pass_example"], str
451            ):
452                raise ValueError(
453                    "pass_example is optional for issue template, but if provided must be a string"
454                )
455
456        if self.template == EvalTemplateId.tool_call:
457            if self.evaluation_data_type != EvalDataType.full_trace:
458                raise ValueError(
459                    "tool_call template should have evaluation_data_type set to full_trace"
460                )
461            if (
462                "tool" not in self.template_properties
463                or not isinstance(self.template_properties["tool"], str)
464                or not self.template_properties["tool"].strip()
465            ):
466                raise ValueError("tool is required for tool call template")
467            if "tool_function_name" not in self.template_properties or not isinstance(
468                self.template_properties["tool_function_name"], str
469            ):
470                raise ValueError(
471                    "tool_function_name is required for tool call template"
472                )
473            if (
474                "appropriate_tool_use_guidelines" not in self.template_properties
475                or not isinstance(
476                    self.template_properties["appropriate_tool_use_guidelines"], str
477                )
478                or not self.template_properties[
479                    "appropriate_tool_use_guidelines"
480                ].strip()
481            ):
482                raise ValueError(
483                    "appropriate_tool_use_guidelines is required for tool call template"
484                )
485            if (
486                "inappropriate_tool_use_guidelines" in self.template_properties
487                and not isinstance(
488                    self.template_properties["inappropriate_tool_use_guidelines"], str
489                )
490            ):
491                raise ValueError(
492                    "inappropriate_tool_use_guidelines is optional for tool call template, but if provided must be a string"
493                )
494        return self

Base model for Kiln models that have a parent-child relationship. This base class is for child models.

This class provides functionality for managing hierarchical relationships between models, including parent reference handling and file system organization.

Attributes: parent (KilnBaseModel): Reference to the parent model instance. Not persisted, just in memory.

name: FilenameString
description: str | None
template: EvalTemplateId | None
current_config_id: Optional[str]
eval_set_filter_id: DatasetFilterId
eval_configs_filter_id: DatasetFilterId | None
output_scores: List[EvalOutputScore]
favourite: bool
template_properties: dict[str, str | int | bool | float]
evaluation_data_type: EvalDataType
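
A minimal construction sketch, assuming the module's types are imported. Field values are hypothetical, the filter ids are placeholders that must be valid DatasetFilterId strings, and eval_configs_filter_id is supplied because validate_template_properties requires it for every template except 'rag':

    my_eval = Eval(
        name="Answer quality",                    # hypothetical name
        eval_set_filter_id="all",                 # placeholder DatasetFilterId
        eval_configs_filter_id="all",             # placeholder DatasetFilterId
        output_scores=[
            EvalOutputScore(
                name="Overall Rating",
                instruction="Rate the overall quality of the answer.",
                type=TaskOutputRatingType.five_star,
            )
        ],
        evaluation_data_type=EvalDataType.final_answer,
    )

In normal use the eval is created with a Task parent so it is saved under the task's directory; that wiring is omitted here.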
def parent_task(self) -> Optional[kiln_ai.datamodel.Task]:
359    def parent_task(self) -> Union["Task", None]:
360        if self.parent is not None and self.parent.__class__.__name__ != "Task":
361            raise ValueError("parent must be a Task")
362        return self.parent  # type: ignore
def configs(self, readonly=False) -> List[EvalConfig]:
695        def child_method(self, readonly: bool = False) -> list[child_class]:
696            return child_class.all_children_of_parent_path(self.path, readonly=readonly)

Dynamically generated accessor (via parent_of) returning this eval's child EvalConfig models, loaded from the eval's child files.

@model_validator(mode='after')
def upgrade_old_reference_answer_eval_config(self) -> Self:
367    @model_validator(mode="after")
368    def upgrade_old_reference_answer_eval_config(self) -> Self:
369        """
370        Migration: Set the first judge config as the default for existing reference answer evals that don't have a current_config_id set.
371
372        For reference_answer evals that don't have a current_config_id set, this migration
373        will set the first config (by created_at) as the default.
374        """
375        if self.id is None:
376            return self
377
378        # Only run during file loading
379        if not self._loaded_from_file:
380            return self
381
382        # Skip if already migrated (has a current_config_id set)
383        if self.current_config_id is not None:
384            return self
385
386        # Only migrate reference_answer evals
387        if self.evaluation_data_type != EvalDataType.reference_answer:
388            return self
389
390        # Prevent recursion: self.configs() loads child files, which re-loads this parent
391        # (see basemodel.py where we iterate_children_paths_of_parent_path calls load_from_file)
392        # This causes the validator to run again, creating an infinite loop without this guard.
393        with _migration_lock:
394            if self.id in _currently_migrating_eval_ids:
395                return self
396            _currently_migrating_eval_ids.add(self.id)
397
398        try:
399            # Get the configs - these are loaded from child files
400            configs_list = self.configs(readonly=True)
401            if configs_list and len(configs_list) > 0:
402                # Sort by created_at to get the oldest (first created) config
403                sorted_configs = sorted(configs_list, key=lambda c: c.created_at)
404                self.current_config_id = sorted_configs[0].id
405        finally:
406            with _migration_lock:
407                _currently_migrating_eval_ids.discard(self.id)
408
409        return self

Migration: Set the first judge config as the default for existing reference answer evals that don't have a current_config_id set.

For reference_answer evals that don't have a current_config_id set, this migration will set the first config (by created_at) as the default.

@model_validator(mode='after')
def validate_scores(self) -> Self:
411    @model_validator(mode="after")
412    def validate_scores(self) -> Self:
413        if self.output_scores is None or len(self.output_scores) == 0:
414            raise ValueError(
415                "output_scores are required, and must have at least one score."
416            )
417
418        # check for duplicate names (once transformed to JSON keys)
419        output_score_keys = [score.json_key() for score in self.output_scores]
420        if len(output_score_keys) != len(set(output_score_keys)):
421            raise ValueError(
422                f"output_scores must have unique names (once transformed to JSON keys). Got: [{', '.join(output_score_keys)}]"
423            )
424        return self
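
A sketch of what the duplicate-name check rejects (the score names below are hypothetical; key normalization follows the EvalOutputScore.json_key example of "Overall Rating" -> "overall_rating"):

    scores = [
        EvalOutputScore(name="Overall Rating", type=TaskOutputRatingType.five_star),
        EvalOutputScore(name="overall rating", type=TaskOutputRatingType.pass_fail),
    ]
    # Both names normalize to the JSON key "overall_rating", so an Eval built with
    # these output_scores fails validate_scores (pydantic surfaces the ValueError
    # as a ValidationError).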
@model_validator(mode='after')
def validate_template_properties(self) -> Self:
426    @model_validator(mode="after")
427    def validate_template_properties(self) -> Self:
428        # eval_configs_filter_id is required for all templates except "rag"
429        if (
430            self.template is not EvalTemplateId.rag
431            and self.eval_configs_filter_id is None
432        ):
433            raise ValueError(
434                "eval_configs_filter_id is required for all templates except 'rag'"
435            )
436
437        # Check for properties that are required for the issue template
438        if self.template == EvalTemplateId.issue:
439            if "issue_prompt" not in self.template_properties or not isinstance(
440                self.template_properties["issue_prompt"], str
441            ):
442                raise ValueError("issue_prompt is required for issue template")
443            if "failure_example" in self.template_properties and not isinstance(
444                self.template_properties["failure_example"], str
445            ):
446                raise ValueError(
447                    "failure_example is optional for issue template, but if provided must be a string"
448                )
449            if "pass_example" in self.template_properties and not isinstance(
450                self.template_properties["pass_example"], str
451            ):
452                raise ValueError(
453                    "pass_example is optional for issue template, but if provided must be a string"
454                )
455
456        if self.template == EvalTemplateId.tool_call:
457            if self.evaluation_data_type != EvalDataType.full_trace:
458                raise ValueError(
459                    "tool_call template should have evaluation_data_type set to full_trace"
460                )
461            if (
462                "tool" not in self.template_properties
463                or not isinstance(self.template_properties["tool"], str)
464                or not self.template_properties["tool"].strip()
465            ):
466                raise ValueError("tool is required for tool call template")
467            if "tool_function_name" not in self.template_properties or not isinstance(
468                self.template_properties["tool_function_name"], str
469            ):
470                raise ValueError(
471                    "tool_function_name is required for tool call template"
472                )
473            if (
474                "appropriate_tool_use_guidelines" not in self.template_properties
475                or not isinstance(
476                    self.template_properties["appropriate_tool_use_guidelines"], str
477                )
478                or not self.template_properties[
479                    "appropriate_tool_use_guidelines"
480                ].strip()
481            ):
482                raise ValueError(
483                    "appropriate_tool_use_guidelines is required for tool call template"
484                )
485            if (
486                "inappropriate_tool_use_guidelines" in self.template_properties
487                and not isinstance(
488                    self.template_properties["inappropriate_tool_use_guidelines"], str
489                )
490            ):
491                raise ValueError(
492                    "inappropriate_tool_use_guidelines is optional for tool call template, but if provided must be a string"
493                )
494        return self
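
A hypothetical template_properties dict that satisfies the tool_call checks above; the tool name and guideline text are illustrative, and an eval using them must also set evaluation_data_type=EvalDataType.full_trace:

    template_properties = {
        "tool": "Web Search",                # hypothetical tool display name (non-empty str)
        "tool_function_name": "web_search",  # hypothetical function name (str)
        "appropriate_tool_use_guidelines": "Call the tool when the answer needs fresh data.",
        # "inappropriate_tool_use_guidelines" is optional; if present it must be a str
    }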
def relationship_name() -> str:
713        def relationship_name_method() -> str:
714            return relationship_name


def parent_type() -> Type[kiln_ai.datamodel.basemodel.KilnParentModel]:
706        def parent_class_method() -> Type[KilnParentModel]:
707            return cls


model_config = {'validate_assignment': True}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

def model_post_init(self: pydantic.main.BaseModel, context: Any, /) -> None:
337def init_private_attributes(self: BaseModel, context: Any, /) -> None:
338    """This function is meant to behave like a BaseModel method to initialise private attributes.
339
340    It takes context as an argument since that's what pydantic-core passes when calling it.
341
342    Args:
343        self: The BaseModel instance.
344        context: The context.
345    """
346    if getattr(self, '__pydantic_private__', None) is None:
347        pydantic_private = {}
348        for name, private_attr in self.__private_attributes__.items():
349            default = private_attr.get_default()
350            if default is not PydanticUndefined:
351                pydantic_private[name] = default
352        object_setattr(self, '__pydantic_private__', pydantic_private)

This function is meant to behave like a BaseModel method to initialise private attributes.

It takes context as an argument since that's what pydantic-core passes when calling it.

Args: self: The BaseModel instance. context: The context.