kiln_ai.datamodel.eval

  1import json
  2from enum import Enum
  3from threading import Lock
  4from typing import TYPE_CHECKING, Any, Dict, List, Union
  5
  6from pydantic import BaseModel, Field, model_validator
  7from typing_extensions import Self
  8
  9from kiln_ai.datamodel.basemodel import (
 10    ID_TYPE,
 11    FilenameString,
 12    FilenameStringShort,
 13    KilnParentedModel,
 14    KilnParentModel,
 15)
 16from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
 17from kiln_ai.datamodel.dataset_filters import DatasetFilterId
 18from kiln_ai.datamodel.json_schema import string_to_json_key
 19from kiln_ai.datamodel.task_run import Usage
 20from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error
 21
 22if TYPE_CHECKING:
 23    from kiln_ai.datamodel.spec import Spec
 24    from kiln_ai.datamodel.task import Task
 25
# Type alias for an evaluator's output: maps each score's JSON key
# (EvalOutputScore.json_key()) to its numeric value.
EvalScores = Dict[str, float]

# Module-level set to track evals currently being migrated (to prevent recursion)
# Protected by _migration_lock to ensure thread-safe access
_migration_lock = Lock()
_currently_migrating_eval_ids: set[ID_TYPE] = set()
 32
 33
class EvalTemplateId(str, Enum):
    """
    An eval template is a pre-defined eval that can be used as a starting point for a new eval.
    """

    kiln_requirements = "kiln_requirements"
    desired_behaviour = "desired_behaviour"
    # NOTE: serialized value intentionally differs from the member name ("kiln_issue"),
    # so persisted data uses "kiln_issue" while code refers to EvalTemplateId.issue.
    issue = "kiln_issue"
    tool_call = "tool_call"
    toxicity = "toxicity"
    bias = "bias"
    maliciousness = "maliciousness"
    factual_correctness = "factual_correctness"
    jailbreak = "jailbreak"
    rag = "rag"
 50
class EvalConfigType(str, Enum):
    """The evaluation algorithm an EvalConfig runs (see EvalConfig.config_type)."""

    g_eval = "g_eval"
    llm_as_judge = "llm_as_judge"
 55
class EvalOutputScore(BaseModel):
    """
    A definition of a score that an evaluator will produce.

    Very similar to TaskRequirement, but conceptually different, so kept as a
    separate model.
    """

    name: FilenameStringShort = Field(
        description="The name of the score. Will be provided to the model so use a descriptive name. Should align to the model's TaskRequirement name if you want to use human evals to evaluate the evaluator's performance."
    )
    instruction: str | None = Field(
        default=None,
        description="A description of the score, used to help the model understand the goal of the score. Will be provided to evaluator models, so should be written for the model, not the team/user.",
    )
    type: TaskOutputRatingType = Field(
        description="The type of rating to use ('five_star', 'pass_fail', 'pass_fail_critical').",
    )

    def json_key(self) -> str:
        """
        The JSON key for the score, used when running the evaluator with a LLM and we need JSON output.

        For example, "Overall Rating" -> "overall_rating"
        """
        return string_to_json_key(self.name)

    @model_validator(mode="after")
    def validate_type(self) -> Self:
        # Evaluators can only emit the structured rating types; 'custom' has no
        # defined numeric range, so it is rejected at construction time.
        if self.type == TaskOutputRatingType.custom:
            raise ValueError(
                f"Custom scores are not supported in evaluators. Score '{self.name}' was set to a custom score."
            )
        return self
 89
 90
class EvalRun(KilnParentedModel):
    """
    The results of running an eval on a single dataset item.

    This is a child of an EvalConfig, which specifies how the scores were generated.

    Eval runs can be one of 2 types:
    1) eval_config_eval=False: we were evaluating a task run (a method of running the task). We get the task input from the dataset_id.input, run the task with the task_run_config, then ran the evaluator on that output. task_run_config_id must be set. The output saved in this model is the output of the task run.
    2) eval_config_eval=True: we were evaluating an eval config (a method of evaluating the task). We used the existing dataset item input/output, and ran the evaluator on it. task_run_config_id must be None. The input/output saved in this model is the input/output of the dataset item.
    """

    dataset_id: ID_TYPE = Field(
        description="The ID of the dataset item that was used for this run. Must belong to the same Task as the grand-parent eval of this EvalRun."
    )
    task_run_config_id: ID_TYPE | None = Field(
        description="The ID of the TaskRunConfig that was run, if this eval run was based on a task run. Must belong to the same Task as this eval. Can be None if this eval run is based on an eval config."
    )
    eval_config_eval: bool = Field(
        description="Whether this eval run to evaluate the parent eval config (evaluating the config using an existing dataset item). If true, task_run_config_id must be None, as we're not running the task.",
        default=False,
    )
    # These two may duplicate the dataset_id.input/output, but we're denormalizing intentionally.
    input: str = Field(
        description="The input to the task. JSON formatted for structured input, plaintext for unstructured input."
    )
    output: str = Field(
        description="The output of the task. JSON formatted for structured output, plaintext for unstructured output."
    )
    reference_answer: str | None = Field(
        default=None,
        description="The reference answer for the input. JSON formatted for structured reference answer, plaintext for unstructured reference answer. Used for reference answer evals.",
    )
    intermediate_outputs: Dict[str, str] | None = Field(
        default=None,
        description="The intermediate outputs of the task (example, eval thinking).",
    )
    task_run_trace: str | None = Field(
        default=None,
        description="The JSON formatted trace of the task run that produced the output.",
    )
    scores: EvalScores = Field(
        description="The output scores of the evaluator (aligning to those required by the grand-parent Eval this object is a child of)."
    )
    task_run_usage: Usage | None = Field(
        default=None,
        description="The usage of the task run that produced this eval run output (not the usage by the evaluation model).",
    )

    def parent_eval_config(self) -> Union["EvalConfig", None]:
        # Typed parent accessor. Compares by class name (not isinstance) so this
        # method works without importing EvalConfig at the point of definition.
        if self.parent is not None and self.parent.__class__.__name__ != "EvalConfig":
            raise ValueError("parent must be an EvalConfig")
        return self.parent  # type: ignore

    @model_validator(mode="after")
    def validate_output_fields(self) -> Self:
        # Ensure task_run_trace presence matches the grand-parent eval's data type:
        # final_answer evals must not carry a trace; full_trace task-run evals must.
        parent_eval_config = self.parent_eval_config()
        parent_eval = parent_eval_config.parent_eval() if parent_eval_config else None
        if not parent_eval:
            # Grand-parent not available yet (e.g. not parented); validated later.
            return self

        evaluation_data_type = parent_eval.evaluation_data_type
        if (
            evaluation_data_type == EvalDataType.final_answer
            and self.task_run_trace is not None
        ):
            raise ValueError("final_answer runs should not set trace")
        elif (
            not self.eval_config_eval
            and evaluation_data_type == EvalDataType.full_trace
            and self.task_run_trace is None
        ):
            raise ValueError("full_trace task run eval runs should include trace")

        return self

    @model_validator(mode="after")
    def validate_eval_run_types(self) -> Self:
        # Enforce the mutual exclusion described in the class docstring:
        # exactly one of (eval_config_eval, task_run_config_id) drives a run.
        if self.eval_config_eval and self.task_run_config_id is not None:
            raise ValueError(
                "task_run_config_id must be None if eval_config_eval is true"
            )
        if not self.eval_config_eval and self.task_run_config_id is None:
            raise ValueError(
                "task_run_config_id must be set if eval_config_eval is false"
            )
        return self

    @model_validator(mode="after")
    def validate_scores(self) -> Self:
        # We're checking the scores have the expected keys from the grand-parent eval
        if self.scores is None or len(self.scores) == 0:
            raise ValueError("scores are required, and must have at least one score.")

        parent_eval_config = self.parent_eval_config()
        eval = parent_eval_config.parent_eval() if parent_eval_config else None
        if not eval:
            # Can't validate without the grand-parent eval, allow it to be validated later
            return self

        # Keys must match the eval's declared output scores exactly (no extras, no missing).
        output_score_keys = [score.json_key() for score in eval.output_scores]
        if set(output_score_keys) != set(self.scores.keys()):
            raise ValueError(
                f"The scores produced by the evaluator must match the scores expected by the eval. Got: [{', '.join(self.scores.keys())}] and expected: [{', '.join(output_score_keys)}]"
            )

        # Check that each score is expected in this eval and the correct type
        for output_score in eval.output_scores:
            match output_score.type:
                case TaskOutputRatingType.five_star:
                    # five_star: float in [1.0, 5.0]
                    five_star_score = self.scores[output_score.json_key()]
                    if (
                        not isinstance(five_star_score, float)
                        or five_star_score < 1.0
                        or five_star_score > 5.0
                    ):
                        raise ValueError(
                            f"Score {output_score.name} is a five_star rating and must be a float between 1.0 and 5.0 inclusive. Got: {five_star_score}"
                        )
                case TaskOutputRatingType.pass_fail:
                    # pass_fail: float in [0.0, 1.0]
                    pass_fail_score = self.scores[output_score.json_key()]
                    if (
                        not isinstance(pass_fail_score, float)
                        or pass_fail_score < 0.0
                        or pass_fail_score > 1.0
                    ):
                        raise ValueError(
                            f"Score {output_score.name} is a pass_fail rating and must be a float between 0.0 and 1.0 inclusive. Got: {pass_fail_score}"
                        )
                case TaskOutputRatingType.pass_fail_critical:
                    # pass_fail_critical: float in [-1.0, 1.0] (negative = critical failure)
                    pass_fail_critical_score = self.scores[output_score.json_key()]
                    if (
                        not isinstance(pass_fail_critical_score, float)
                        or pass_fail_critical_score < -1.0
                        or pass_fail_critical_score > 1.0
                    ):
                        raise ValueError(
                            f"Score {output_score.name} is a pass_fail_critical rating and must be a float between -1.0 and 1.0 inclusive. Got: {pass_fail_critical_score}"
                        )
                case TaskOutputRatingType.custom:
                    raise ValueError(
                        f"Custom scores are not supported in evaluators. '{output_score.name}' was set to a custom score."
                    )
                case _:
                    # Catch missing cases
                    raise_exhaustive_enum_error(output_score.type)
        return self

    @model_validator(mode="after")
    def validate_reference_answer(self) -> Self:
        # reference_answer may only be populated when the grand-parent eval is a
        # reference_answer-type eval.
        parent_eval_config = self.parent_eval_config()
        parent_eval = parent_eval_config.parent_eval() if parent_eval_config else None
        if not parent_eval:
            # Can't validate without the grand-parent eval, allow it to be validated later
            return self

        evaluation_data_type = parent_eval.evaluation_data_type
        if (
            self.reference_answer is not None
            and evaluation_data_type != EvalDataType.reference_answer
        ):
            raise ValueError(
                f"reference_answer is only valid for reference answer evals. Got: {evaluation_data_type.value}"
            )
        return self
255
256
class EvalConfig(KilnParentedModel, KilnParentModel, parent_of={"runs": EvalRun}):
    """
    A configuration for running an eval. This includes anything needed to run the eval on a dataset like the prompt, model, thresholds, etc.

    A eval might have many configs, example running the same eval with 2 different models. Comparing eval results is only valid within the scope of the same config.
    """

    name: FilenameString = Field(description="The name of the eval config.")
    model_name: str = Field(
        description="The name of the model to use for this eval config. ",
    )
    model_provider: str = Field(
        description="The provider of the model to use for this eval config.",
    )
    config_type: EvalConfigType = Field(
        default=EvalConfigType.g_eval,
        description="This is used to determine the type of eval to run.",
    )
    # default_factory avoids a shared mutable default literal (the recommended
    # pydantic idiom; behavior-identical since pydantic copies defaults anyway).
    properties: dict[str, Any] = Field(
        default_factory=dict,
        description="Properties to be used to execute the eval config. This is config_type specific and should serialize to a json dict.",
    )

    def parent_eval(self) -> Union["Eval", None]:
        # Typed parent accessor. Compares by class name (not isinstance) to avoid
        # importing Eval here; raises if the parent is some other model type.
        if self.parent is not None and self.parent.__class__.__name__ != "Eval":
            raise ValueError("parent must be an Eval")
        return self.parent  # type: ignore

    def runs(self, readonly: bool = False) -> list[EvalRun]:
        # Typed wrapper over the generated child accessor from KilnParentModel.
        return super().runs(readonly=readonly)  # type: ignore

    @model_validator(mode="after")
    def validate_properties(self) -> Self:
        """Check that `properties` contains the keys required by `config_type`.

        Both g_eval and llm_as_judge require `eval_steps` (a list) and accept an
        optional string `task_description`.
        """
        if (
            self.config_type == EvalConfigType.g_eval
            or self.config_type == EvalConfigType.llm_as_judge
        ):
            if "eval_steps" not in self.properties or not isinstance(
                self.properties["eval_steps"], list
            ):
                raise ValueError("eval_steps is required and must be a list for g_eval")
            if "task_description" in self.properties and not isinstance(
                self.properties["task_description"], str
            ):
                raise ValueError(
                    "task_description is optional, but if provided must be a string"
                )
            return self
        else:
            raise ValueError(f"Invalid eval config type: {self.config_type}")

    @model_validator(mode="after")
    def validate_json_serializable(self) -> Self:
        """Ensure `properties` can round-trip through JSON when persisted."""
        try:
            # This will raise a TypeError if the dict contains non-JSON-serializable objects
            json.dumps(self.properties)
        except TypeError as e:
            # Chain the original TypeError so the root cause appears in tracebacks.
            raise ValueError(f"Properties must be JSON serializable: {e!s}") from e
        return self
316
317
class EvalDataType(str, Enum):
    """Which part of a task run this eval evaluates (see Eval.evaluation_data_type)."""

    final_answer = "final_answer"
    full_trace = "full_trace"
    reference_answer = "reference_answer"
322
323
class Eval(KilnParentedModel, KilnParentModel, parent_of={"configs": EvalConfig}):
    """
    An evaluator definition for a task: which dataset items to evaluate, which
    scores to produce, and which EvalConfig is currently used to run it.

    Child of a Task; parent of EvalConfigs. Includes on-load migrations for
    legacy files (see the model validators below).
    """

    name: FilenameString = Field(description="The name of the eval.")
    description: str | None = Field(
        default=None, description="The description of the eval"
    )
    template: EvalTemplateId | None = Field(
        default=None,
        description="The template selected when creating this eval. Useful for suggesting eval steps and output scores.",
    )
    current_config_id: ID_TYPE = Field(
        default=None,
        description="The id of the current config to use for this eval. This can be changed over time to run the same eval with different configs.",
    )
    eval_set_filter_id: DatasetFilterId = Field(
        description="The id of the dataset filter which defines which dataset items are included when running this eval. Should be mutually exclusive with eval_configs_filter_id and train_set_filter_id."
    )
    eval_configs_filter_id: DatasetFilterId | None = Field(
        default=None,
        description="The id of the dataset filter which defines which dataset items are included when comparing the quality of the eval configs under this eval. Should consist of dataset items with ratings. Should be mutually exclusive with eval_set_filter_id.",
    )
    train_set_filter_id: DatasetFilterId | None = Field(
        default=None,
        description="The id of the dataset filter which defines which dataset items are included in the training set for fine-tuning. Should be mutually exclusive with eval_set_filter_id.",
    )
    output_scores: List[EvalOutputScore] = Field(
        description="The scores this evaluator should produce."
    )
    favourite: bool = Field(
        default=False,
        description="Whether this eval is a favourite of the user. Rendered as a star icon in the UI.",
    )
    template_properties: dict[str, str | int | bool | float] | None = Field(
        default=None,
        description="Properties to be used to execute the eval. This is template_type specific and should serialize to a json dict.",
    )
    evaluation_data_type: EvalDataType = Field(
        default=EvalDataType.final_answer,
        description="The output of the task run to evaluate. Can be final answer or full trace.",
    )

    # Workaround to return typed parent without importing Task
    def parent_task(self) -> Union["Task", None]:
        if self.parent is not None and self.parent.__class__.__name__ != "Task":
            raise ValueError("parent must be a Task")
        return self.parent  # type: ignore

    def configs(self, readonly: bool = False) -> list[EvalConfig]:
        # Typed wrapper over the generated child accessor from KilnParentModel.
        return super().configs(readonly=readonly)  # type: ignore

    # Workaround to return typed parent without importing Spec
    def associated_spec(self, readonly: bool = False) -> Union["Spec", None]:
        """
        Get the spec associated with this eval, if any.
        Returns None for legacy evals that are not associated with a spec.
        """

        task = self.parent_task()
        if not task or not self.id:
            return None

        # Linear scan over the task's specs; specs link back to evals via eval_id.
        specs = task.specs(readonly=readonly)
        for spec in specs:
            if spec.eval_id == self.id:
                return spec
        return None

    @model_validator(mode="after")
    def upgrade_old_reference_answer_eval_config(self) -> Self:
        """
        Migration: Set the first judge config as the default for existing reference answer evals that don't have a current_config_id set.

        For reference_answer evals that don't have a current_config_id set, this migration
        will set the first config (by created_at) as the default.
        """
        # No id means a brand-new (unsaved) eval; nothing to migrate.
        if self.id is None:
            return self

        # Only run during file loading
        if not self._loaded_from_file:
            return self

        # Skip if already migrated (has a current_config_id set)
        if self.current_config_id is not None:
            return self

        # Only migrate reference_answer evals
        if self.evaluation_data_type != EvalDataType.reference_answer:
            return self

        # Prevent recursion: self.configs() loads child files, which re-loads this parent
        # (see basemodel.py where we iterate_children_paths_of_parent_path calls load_from_file)
        # This causes the validator to run again, creating an infinite loop without this guard.
        with _migration_lock:
            if self.id in _currently_migrating_eval_ids:
                return self
            _currently_migrating_eval_ids.add(self.id)

        try:
            # Get the configs - these are loaded from child files
            configs_list = self.configs(readonly=True)
            if configs_list and len(configs_list) > 0:
                # Sort by created_at to get the oldest (first created) config
                sorted_configs = sorted(configs_list, key=lambda c: c.created_at)
                self.current_config_id = sorted_configs[0].id
        finally:
            # Always clear the in-progress marker, even if loading configs raised.
            with _migration_lock:
                _currently_migrating_eval_ids.discard(self.id)

        return self

    @model_validator(mode="after")
    def migrate_train_set_filter_id(self) -> Self:
        """
        Migration: Auto-create a train_set_filter_id for legacy evals that don't have one.

        Generates a tag-based filter ID from the eval name following the convention
        used by spec-based evals (e.g., "train_{name_slug}").
        """
        if self.id is None:
            return self

        # Only run during file loading (same guard as the config migration above).
        if not self._loaded_from_file:
            return self

        if self.train_set_filter_id is not None:
            return self

        # Slugify the name: lowercase with spaces replaced by underscores.
        tag_suffix = self.name.lower().replace(" ", "_")
        self.train_set_filter_id = f"tag::train_{tag_suffix}"
        return self

    @model_validator(mode="after")
    def validate_scores(self) -> Self:
        if self.output_scores is None or len(self.output_scores) == 0:
            raise ValueError(
                "output_scores are required, and must have at least one score."
            )

        # check for duplicate names (once transformed to JSON keys)
        output_score_keys = [score.json_key() for score in self.output_scores]
        if len(output_score_keys) != len(set(output_score_keys)):
            raise ValueError(
                f"output_scores must have unique names (once transformed to JSON keys). Got: [{', '.join(output_score_keys)}]"
            )
        return self

    @model_validator(mode="after")
    def validate_template_properties(self) -> Self:
        # eval_configs_filter_id is required for all templates except "rag"
        if (
            self.template is not EvalTemplateId.rag
            and self.eval_configs_filter_id is None
        ):
            raise ValueError(
                "eval_configs_filter_id is required for all templates except 'rag'"
            )

        # For spec-based evals, template_properties will be None and validation happens in the spec
        # For legacy evals, template_properties contains the data and we validate here
        if self.template_properties is None:
            return self

        # Check for properties that are required for the issue template (legacy evals only)
        if self.template == EvalTemplateId.issue:
            if "issue_prompt" not in self.template_properties or not isinstance(
                self.template_properties["issue_prompt"], str
            ):
                raise ValueError("issue_prompt is required for issue template")
            if "failure_example" in self.template_properties and not isinstance(
                self.template_properties["failure_example"], str
            ):
                raise ValueError(
                    "failure_example is optional for issue template, but if provided must be a string"
                )
            if "pass_example" in self.template_properties and not isinstance(
                self.template_properties["pass_example"], str
            ):
                raise ValueError(
                    "pass_example is optional for issue template, but if provided must be a string"
                )

        if self.template == EvalTemplateId.tool_call:
            # tool_call evals need the full trace to inspect tool invocations.
            if self.evaluation_data_type != EvalDataType.full_trace:
                raise ValueError(
                    "tool_call template should have evaluation_data_type set to full_trace"
                )
            if (
                "tool" not in self.template_properties
                or not isinstance(self.template_properties["tool"], str)
                or not self.template_properties["tool"].strip()
            ):
                raise ValueError("tool is required for tool call template")
            if "tool_function_name" not in self.template_properties or not isinstance(
                self.template_properties["tool_function_name"], str
            ):
                raise ValueError(
                    "tool_function_name is required for tool call template"
                )
            if (
                "appropriate_tool_use_guidelines" not in self.template_properties
                or not isinstance(
                    self.template_properties["appropriate_tool_use_guidelines"], str
                )
                or not self.template_properties[
                    "appropriate_tool_use_guidelines"
                ].strip()
            ):
                raise ValueError(
                    "appropriate_tool_use_guidelines is required for tool call template"
                )
            if (
                "inappropriate_tool_use_guidelines" in self.template_properties
                and not isinstance(
                    self.template_properties["inappropriate_tool_use_guidelines"], str
                )
            ):
                raise ValueError(
                    "inappropriate_tool_use_guidelines is optional for tool call template, but if provided must be a string"
                )
        return self
EvalScores = typing.Dict[str, float]
class EvalTemplateId(builtins.str, enum.Enum):
35class EvalTemplateId(str, Enum):
36    """
37    An eval template is a pre-defined eval that can be used as a starting point for a new eval.
38    """
39
40    kiln_requirements = "kiln_requirements"
41    desired_behaviour = "desired_behaviour"
42    issue = "kiln_issue"
43    tool_call = "tool_call"
44    toxicity = "toxicity"
45    bias = "bias"
46    maliciousness = "maliciousness"
47    factual_correctness = "factual_correctness"
48    jailbreak = "jailbreak"
49    rag = "rag"

An eval template is a pre-defined eval that can be used as a starting point for a new eval.

kiln_requirements = <EvalTemplateId.kiln_requirements: 'kiln_requirements'>
desired_behaviour = <EvalTemplateId.desired_behaviour: 'desired_behaviour'>
issue = <EvalTemplateId.issue: 'kiln_issue'>
tool_call = <EvalTemplateId.tool_call: 'tool_call'>
toxicity = <EvalTemplateId.toxicity: 'toxicity'>
bias = <EvalTemplateId.bias: 'bias'>
maliciousness = <EvalTemplateId.maliciousness: 'maliciousness'>
factual_correctness = <EvalTemplateId.factual_correctness: 'factual_correctness'>
jailbreak = <EvalTemplateId.jailbreak: 'jailbreak'>
rag = <EvalTemplateId.rag: 'rag'>
class EvalConfigType(builtins.str, enum.Enum):
52class EvalConfigType(str, Enum):
53    g_eval = "g_eval"
54    llm_as_judge = "llm_as_judge"

str(object='') -> str str(bytes_or_buffer[, encoding[, errors]]) -> str

Create a new string object from the given object. If encoding or errors is specified, then the object must expose a data buffer that will be decoded using the given encoding and error handler. Otherwise, returns the result of object.__str__() (if defined) or repr(object). encoding defaults to 'utf-8'. errors defaults to 'strict'.

g_eval = <EvalConfigType.g_eval: 'g_eval'>
llm_as_judge = <EvalConfigType.llm_as_judge: 'llm_as_judge'>
class EvalOutputScore(pydantic.main.BaseModel):
57class EvalOutputScore(BaseModel):
58    """
59    A definition of a score that an evaluator will produce.
60
61    Very similar to TaskRequirement, but conceptually different keeping in a separate models.
62    """
63
64    name: FilenameStringShort = Field(
65        description="The name of the score. Will be provided to the model so use a descriptive name. Should align to the model's TaskRequirement name if you want to use human evals to evaluate the evaluator's performance."
66    )
67    instruction: str | None = Field(
68        default=None,
69        description="A description of the score, used to help the model understand the goal of the score. Will be provided to evaluator models, so should be written for the model, not the team/user.",
70    )
71    type: TaskOutputRatingType = Field(
72        description="The type of rating to use ('five_star', 'pass_fail', 'pass_fail_critical').",
73    )
74
75    def json_key(self) -> str:
76        """
77        The JSON key for the score, used when running the evaluator with a LLM and we need JSON output.
78
79        For example, "Overall Rating" -> "overall_rating"
80        """
81        return string_to_json_key(self.name)
82
83    @model_validator(mode="after")
84    def validate_type(self) -> Self:
85        if self.type == TaskOutputRatingType.custom:
86            raise ValueError(
87                f"Custom scores are not supported in evaluators. Score '{self.name}' was set to a custom score."
88            )
89        return self

A definition of a score that an evaluator will produce.

Very similar to TaskRequirement, but conceptually different keeping in a separate models.

name: Annotated[str, BeforeValidator(func=<function name_validator.<locals>.fn at 0x7f5e59c4e520>, json_schema_input_type=PydanticUndefined)]
instruction: str | None
def json_key(self) -> str:
75    def json_key(self) -> str:
76        """
77        The JSON key for the score, used when running the evaluator with a LLM and we need JSON output.
78
79        For example, "Overall Rating" -> "overall_rating"
80        """
81        return string_to_json_key(self.name)

The JSON key for the score, used when running the evaluator with a LLM and we need JSON output.

For example, "Overall Rating" -> "overall_rating"

@model_validator(mode='after')
def validate_type(self) -> Self:
83    @model_validator(mode="after")
84    def validate_type(self) -> Self:
85        if self.type == TaskOutputRatingType.custom:
86            raise ValueError(
87                f"Custom scores are not supported in evaluators. Score '{self.name}' was set to a custom score."
88            )
89        return self
model_config: ClassVar[pydantic.config.ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

class EvalRun(kiln_ai.datamodel.basemodel.KilnParentedModel):
 92class EvalRun(KilnParentedModel):
 93    """
 94    The results of running an eval on a single dataset item.
 95
 96    This is a child of an EvalConfig, which specifies how the scores were generated.
 97
 98    Eval runs can be one of 2 types:
 99    1) eval_config_eval=False: we were evaluating a task run (a method of running the task). We get the task input from the dataset_id.input, run the task with the task_run_config, then ran the evaluator on that output. task_run_config_id must be set. The output saved in this model is the output of the task run.
100    2) eval_config_eval=True: we were evaluating an eval config (a method of evaluating the task). We used the existing dataset item input/output, and ran the evaluator on it. task_run_config_id must be None. The input/output saved in this model is the input/output of the dataset item.
101    """
102
103    dataset_id: ID_TYPE = Field(
104        description="The ID of the dataset item that was used for this run. Must belong to the same Task as the grand-parent eval of this EvalRun."
105    )
106    task_run_config_id: ID_TYPE | None = Field(
107        description="The ID of the TaskRunConfig that was run, if this eval run was based on a task run. Must belong to the same Task as this eval. Can be None if this eval run is based on an eval config."
108    )
109    eval_config_eval: bool = Field(
110        description="Whether this eval run to evaluate the parent eval config (evaluating the config using an existing dataset item). If true, task_run_config_id must be None, as we're not running the task.",
111        default=False,
112    )
113    # These two may duplicate the dataset_id.input/output, but we're denormalizing intentionally.
114    input: str = Field(
115        description="The input to the task. JSON formatted for structured input, plaintext for unstructured input."
116    )
117    output: str = Field(
118        description="The output of the task. JSON formatted for structured output, plaintext for unstructured output."
119    )
120    reference_answer: str | None = Field(
121        default=None,
122        description="The reference answer for the input. JSON formatted for structured reference answer, plaintext for unstructured reference answer. Used for reference answer evals.",
123    )
124    intermediate_outputs: Dict[str, str] | None = Field(
125        default=None,
126        description="The intermediate outputs of the task (example, eval thinking).",
127    )
128    task_run_trace: str | None = Field(
129        default=None,
130        description="The JSON formatted trace of the task run that produced the output.",
131    )
132    scores: EvalScores = Field(
133        description="The output scores of the evaluator (aligning to those required by the grand-parent Eval this object is a child of)."
134    )
135    task_run_usage: Usage | None = Field(
136        default=None,
137        description="The usage of the task run that produced this eval run output (not the usage by the evaluation model).",
138    )
139
140    def parent_eval_config(self) -> Union["EvalConfig", None]:
141        if self.parent is not None and self.parent.__class__.__name__ != "EvalConfig":
142            raise ValueError("parent must be an EvalConfig")
143        return self.parent  # type: ignore
144
145    @model_validator(mode="after")
146    def validate_output_fields(self) -> Self:
147        parent_eval_config = self.parent_eval_config()
148        parent_eval = parent_eval_config.parent_eval() if parent_eval_config else None
149        if not parent_eval:
150            return self
151
152        evaluation_data_type = parent_eval.evaluation_data_type
153        if (
154            evaluation_data_type == EvalDataType.final_answer
155            and self.task_run_trace is not None
156        ):
157            raise ValueError("final_answer runs should not set trace")
158        elif (
159            not self.eval_config_eval
160            and evaluation_data_type == EvalDataType.full_trace
161            and self.task_run_trace is None
162        ):
163            raise ValueError("full_trace task run eval runs should include trace")
164
165        return self
166
167    @model_validator(mode="after")
168    def validate_eval_run_types(self) -> Self:
169        if self.eval_config_eval and self.task_run_config_id is not None:
170            raise ValueError(
171                "task_run_config_id must be None if eval_config_eval is true"
172            )
173        if not self.eval_config_eval and self.task_run_config_id is None:
174            raise ValueError(
175                "task_run_config_id must be set if eval_config_eval is false"
176            )
177        return self
178
179    @model_validator(mode="after")
180    def validate_scores(self) -> Self:
181        # We're checking the scores have the expected keys from the grand-parent eval
182        if self.scores is None or len(self.scores) == 0:
183            raise ValueError("scores are required, and must have at least one score.")
184
185        parent_eval_config = self.parent_eval_config()
186        eval = parent_eval_config.parent_eval() if parent_eval_config else None
187        if not eval:
188            # Can't validate without the grand-parent eval, allow it to be validated later
189            return self
190
191        output_score_keys = [score.json_key() for score in eval.output_scores]
192        if set(output_score_keys) != set(self.scores.keys()):
193            raise ValueError(
194                f"The scores produced by the evaluator must match the scores expected by the eval. Got: [{', '.join(self.scores.keys())}] and expected: [{', '.join(output_score_keys)}]"
195            )
196
197        # Check that each score is expected in this eval and the correct type
198        for output_score in eval.output_scores:
199            match output_score.type:
200                case TaskOutputRatingType.five_star:
201                    five_star_score = self.scores[output_score.json_key()]
202                    if (
203                        not isinstance(five_star_score, float)
204                        or five_star_score < 1.0
205                        or five_star_score > 5.0
206                    ):
207                        raise ValueError(
208                            f"Score {output_score.name} is a five_star rating and must be a float between 1.0 and 5.0 inclusive. Got: {five_star_score}"
209                        )
210                case TaskOutputRatingType.pass_fail:
211                    pass_fail_score = self.scores[output_score.json_key()]
212                    if (
213                        not isinstance(pass_fail_score, float)
214                        or pass_fail_score < 0.0
215                        or pass_fail_score > 1.0
216                    ):
217                        raise ValueError(
218                            f"Score {output_score.name} is a pass_fail rating and must be a float between 0.0 and 1.0 inclusive. Got: {pass_fail_score}"
219                        )
220                case TaskOutputRatingType.pass_fail_critical:
221                    pass_fail_critical_score = self.scores[output_score.json_key()]
222                    if (
223                        not isinstance(pass_fail_critical_score, float)
224                        or pass_fail_critical_score < -1.0
225                        or pass_fail_critical_score > 1.0
226                    ):
227                        raise ValueError(
228                            f"Score {output_score.name} is a pass_fail_critical rating and must be a float between -1.0 and 1.0 inclusive. Got: {pass_fail_critical_score}"
229                        )
230                case TaskOutputRatingType.custom:
231                    raise ValueError(
232                        f"Custom scores are not supported in evaluators. '{output_score.name}' was set to a custom score."
233                    )
234                case _:
235                    # Catch missing cases
236                    raise_exhaustive_enum_error(output_score.type)
237        return self
238
239    @model_validator(mode="after")
240    def validate_reference_answer(self) -> Self:
241        parent_eval_config = self.parent_eval_config()
242        parent_eval = parent_eval_config.parent_eval() if parent_eval_config else None
243        if not parent_eval:
244            # Can't validate without the grand-parent eval, allow it to be validated later
245            return self
246
247        evaluation_data_type = parent_eval.evaluation_data_type
248        if (
249            self.reference_answer is not None
250            and evaluation_data_type != EvalDataType.reference_answer
251        ):
252            raise ValueError(
253                f"reference_answer is only valid for reference answer evals. Got: {evaluation_data_type.value}"
254            )
255        return self

The results of running an eval on a single dataset item.

This is a child of an EvalConfig, which specifies how the scores were generated.

Eval runs can be one of 2 types:

1) eval_config_eval=False: we were evaluating a task run (a method of running the task). We get the task input from the dataset_id.input, run the task with the task_run_config, then ran the evaluator on that output. task_run_config_id must be set. The output saved in this model is the output of the task run.

2) eval_config_eval=True: we were evaluating an eval config (a method of evaluating the task). We used the existing dataset item input/output, and ran the evaluator on it. task_run_config_id must be None. The input/output saved in this model is the input/output of the dataset item.

dataset_id: Optional[str]
task_run_config_id: Optional[str]
eval_config_eval: bool
input: str
output: str
reference_answer: str | None
intermediate_outputs: Optional[Dict[str, str]]
task_run_trace: str | None
scores: Dict[str, float]
task_run_usage: kiln_ai.datamodel.Usage | None
def parent_eval_config(self) -> Optional[EvalConfig]:
140    def parent_eval_config(self) -> Union["EvalConfig", None]:
141        if self.parent is not None and self.parent.__class__.__name__ != "EvalConfig":
142            raise ValueError("parent must be an EvalConfig")
143        return self.parent  # type: ignore
@model_validator(mode='after')
def validate_output_fields(self) -> Self:
145    @model_validator(mode="after")
146    def validate_output_fields(self) -> Self:
147        parent_eval_config = self.parent_eval_config()
148        parent_eval = parent_eval_config.parent_eval() if parent_eval_config else None
149        if not parent_eval:
150            return self
151
152        evaluation_data_type = parent_eval.evaluation_data_type
153        if (
154            evaluation_data_type == EvalDataType.final_answer
155            and self.task_run_trace is not None
156        ):
157            raise ValueError("final_answer runs should not set trace")
158        elif (
159            not self.eval_config_eval
160            and evaluation_data_type == EvalDataType.full_trace
161            and self.task_run_trace is None
162        ):
163            raise ValueError("full_trace task run eval runs should include trace")
164
165        return self
@model_validator(mode='after')
def validate_eval_run_types(self) -> Self:
167    @model_validator(mode="after")
168    def validate_eval_run_types(self) -> Self:
169        if self.eval_config_eval and self.task_run_config_id is not None:
170            raise ValueError(
171                "task_run_config_id must be None if eval_config_eval is true"
172            )
173        if not self.eval_config_eval and self.task_run_config_id is None:
174            raise ValueError(
175                "task_run_config_id must be set if eval_config_eval is false"
176            )
177        return self
@model_validator(mode='after')
def validate_scores(self) -> Self:
179    @model_validator(mode="after")
180    def validate_scores(self) -> Self:
181        # We're checking the scores have the expected keys from the grand-parent eval
182        if self.scores is None or len(self.scores) == 0:
183            raise ValueError("scores are required, and must have at least one score.")
184
185        parent_eval_config = self.parent_eval_config()
186        eval = parent_eval_config.parent_eval() if parent_eval_config else None
187        if not eval:
188            # Can't validate without the grand-parent eval, allow it to be validated later
189            return self
190
191        output_score_keys = [score.json_key() for score in eval.output_scores]
192        if set(output_score_keys) != set(self.scores.keys()):
193            raise ValueError(
194                f"The scores produced by the evaluator must match the scores expected by the eval. Got: [{', '.join(self.scores.keys())}] and expected: [{', '.join(output_score_keys)}]"
195            )
196
197        # Check that each score is expected in this eval and the correct type
198        for output_score in eval.output_scores:
199            match output_score.type:
200                case TaskOutputRatingType.five_star:
201                    five_star_score = self.scores[output_score.json_key()]
202                    if (
203                        not isinstance(five_star_score, float)
204                        or five_star_score < 1.0
205                        or five_star_score > 5.0
206                    ):
207                        raise ValueError(
208                            f"Score {output_score.name} is a five_star rating and must be a float between 1.0 and 5.0 inclusive. Got: {five_star_score}"
209                        )
210                case TaskOutputRatingType.pass_fail:
211                    pass_fail_score = self.scores[output_score.json_key()]
212                    if (
213                        not isinstance(pass_fail_score, float)
214                        or pass_fail_score < 0.0
215                        or pass_fail_score > 1.0
216                    ):
217                        raise ValueError(
218                            f"Score {output_score.name} is a pass_fail rating and must be a float between 0.0 and 1.0 inclusive. Got: {pass_fail_score}"
219                        )
220                case TaskOutputRatingType.pass_fail_critical:
221                    pass_fail_critical_score = self.scores[output_score.json_key()]
222                    if (
223                        not isinstance(pass_fail_critical_score, float)
224                        or pass_fail_critical_score < -1.0
225                        or pass_fail_critical_score > 1.0
226                    ):
227                        raise ValueError(
228                            f"Score {output_score.name} is a pass_fail_critical rating and must be a float between -1.0 and 1.0 inclusive. Got: {pass_fail_critical_score}"
229                        )
230                case TaskOutputRatingType.custom:
231                    raise ValueError(
232                        f"Custom scores are not supported in evaluators. '{output_score.name}' was set to a custom score."
233                    )
234                case _:
235                    # Catch missing cases
236                    raise_exhaustive_enum_error(output_score.type)
237        return self
@model_validator(mode='after')
def validate_reference_answer(self) -> Self:
239    @model_validator(mode="after")
240    def validate_reference_answer(self) -> Self:
241        parent_eval_config = self.parent_eval_config()
242        parent_eval = parent_eval_config.parent_eval() if parent_eval_config else None
243        if not parent_eval:
244            # Can't validate without the grand-parent eval, allow it to be validated later
245            return self
246
247        evaluation_data_type = parent_eval.evaluation_data_type
248        if (
249            self.reference_answer is not None
250            and evaluation_data_type != EvalDataType.reference_answer
251        ):
252            raise ValueError(
253                f"reference_answer is only valid for reference answer evals. Got: {evaluation_data_type.value}"
254            )
255        return self
def relationship_name() -> str:
727        def relationship_name_method() -> str:
728            return relationship_name

Returns the configured relationship name linking this child model to its parent.

def parent_type() -> Type[kiln_ai.datamodel.basemodel.KilnParentModel]:
720        def parent_class_method() -> Type[KilnParentModel]:
721            return cls

Returns the parent model class expected for this child model.

model_config = {'validate_assignment': True}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

def model_post_init(self: pydantic.main.BaseModel, context: Any, /) -> None:
337def init_private_attributes(self: BaseModel, context: Any, /) -> None:
338    """This function is meant to behave like a BaseModel method to initialise private attributes.
339
340    It takes context as an argument since that's what pydantic-core passes when calling it.
341
342    Args:
343        self: The BaseModel instance.
344        context: The context.
345    """
346    if getattr(self, '__pydantic_private__', None) is None:
347        pydantic_private = {}
348        for name, private_attr in self.__private_attributes__.items():
349            default = private_attr.get_default()
350            if default is not PydanticUndefined:
351                pydantic_private[name] = default
352        object_setattr(self, '__pydantic_private__', pydantic_private)

This function is meant to behave like a BaseModel method to initialise private attributes.

It takes context as an argument since that's what pydantic-core passes when calling it.

Args:
    self: The BaseModel instance.
    context: The context.

class EvalConfig(kiln_ai.datamodel.basemodel.KilnParentedModel, kiln_ai.datamodel.basemodel.KilnParentModel):
258class EvalConfig(KilnParentedModel, KilnParentModel, parent_of={"runs": EvalRun}):
259    """
260    A configuration for running an eval. This includes anything needed to run the eval on a dataset like the prompt, model, thresholds, etc.
261
262    A eval might have many configs, example running the same eval with 2 different models. Comparing eval results is only valid within the scope of the same config.
263    """
264
265    name: FilenameString = Field(description="The name of the eval config.")
266    model_name: str = Field(
267        description="The name of the model to use for this eval config. ",
268    )
269    model_provider: str = Field(
270        description="The provider of the model to use for this eval config.",
271    )
272    config_type: EvalConfigType = Field(
273        default=EvalConfigType.g_eval,
274        description="This is used to determine the type of eval to run.",
275    )
276    properties: dict[str, Any] = Field(
277        default={},
278        description="Properties to be used to execute the eval config. This is config_type specific and should serialize to a json dict.",
279    )
280
281    def parent_eval(self) -> Union["Eval", None]:
282        if self.parent is not None and self.parent.__class__.__name__ != "Eval":
283            raise ValueError("parent must be an Eval")
284        return self.parent  # type: ignore
285
286    def runs(self, readonly: bool = False) -> list[EvalRun]:
287        return super().runs(readonly=readonly)  # type: ignore
288
289    @model_validator(mode="after")
290    def validate_properties(self) -> Self:
291        if (
292            self.config_type == EvalConfigType.g_eval
293            or self.config_type == EvalConfigType.llm_as_judge
294        ):
295            if "eval_steps" not in self.properties or not isinstance(
296                self.properties["eval_steps"], list
297            ):
298                raise ValueError("eval_steps is required and must be a list for g_eval")
299            if "task_description" in self.properties and not isinstance(
300                self.properties["task_description"], str
301            ):
302                raise ValueError(
303                    "task_description is optional, but if provided must be a string"
304                )
305            return self
306        else:
307            raise ValueError(f"Invalid eval config type: {self.config_type}")
308
309    @model_validator(mode="after")
310    def validate_json_serializable(self) -> "EvalConfig":
311        try:
312            # This will raise a TypeError if the dict contains non-JSON-serializable objects
313            json.dumps(self.properties)
314        except TypeError as e:
315            raise ValueError(f"Properties must be JSON serializable: {e!s}")
316        return self

A configuration for running an eval. This includes anything needed to run the eval on a dataset like the prompt, model, thresholds, etc.

An eval might have many configs, for example running the same eval with 2 different models. Comparing eval results is only valid within the scope of the same config.

name: Annotated[str, BeforeValidator(func=<function name_validator.<locals>.fn at 0x7f5e59c4d300>, json_schema_input_type=PydanticUndefined)]
model_name: str
model_provider: str
config_type: EvalConfigType
properties: dict[str, typing.Any]
def parent_eval(self) -> Optional[Eval]:
281    def parent_eval(self) -> Union["Eval", None]:
282        if self.parent is not None and self.parent.__class__.__name__ != "Eval":
283            raise ValueError("parent must be an Eval")
284        return self.parent  # type: ignore
def runs(self, readonly=False) -> List[EvalRun]:
709        def child_method(self, readonly: bool = False) -> list[child_class]:  # type: ignore[invalid-type-form]
710            return child_class.all_children_of_parent_path(self.path, readonly=readonly)

Returns all child EvalRun objects stored under this parent's path.

@model_validator(mode='after')
def validate_properties(self) -> Self:
289    @model_validator(mode="after")
290    def validate_properties(self) -> Self:
291        if (
292            self.config_type == EvalConfigType.g_eval
293            or self.config_type == EvalConfigType.llm_as_judge
294        ):
295            if "eval_steps" not in self.properties or not isinstance(
296                self.properties["eval_steps"], list
297            ):
298                raise ValueError("eval_steps is required and must be a list for g_eval")
299            if "task_description" in self.properties and not isinstance(
300                self.properties["task_description"], str
301            ):
302                raise ValueError(
303                    "task_description is optional, but if provided must be a string"
304                )
305            return self
306        else:
307            raise ValueError(f"Invalid eval config type: {self.config_type}")
@model_validator(mode='after')
def validate_json_serializable(self) -> EvalConfig:
309    @model_validator(mode="after")
310    def validate_json_serializable(self) -> "EvalConfig":
311        try:
312            # This will raise a TypeError if the dict contains non-JSON-serializable objects
313            json.dumps(self.properties)
314        except TypeError as e:
315            raise ValueError(f"Properties must be JSON serializable: {e!s}")
316        return self
def relationship_name() -> str:
727        def relationship_name_method() -> str:
728            return relationship_name

Returns the configured relationship name linking this child model to its parent.

def parent_type() -> Type[kiln_ai.datamodel.basemodel.KilnParentModel]:
720        def parent_class_method() -> Type[KilnParentModel]:
721            return cls

Returns the parent model class expected for this child model.

model_config = {'validate_assignment': True}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

def model_post_init(self: pydantic.main.BaseModel, context: Any, /) -> None:
337def init_private_attributes(self: BaseModel, context: Any, /) -> None:
338    """This function is meant to behave like a BaseModel method to initialise private attributes.
339
340    It takes context as an argument since that's what pydantic-core passes when calling it.
341
342    Args:
343        self: The BaseModel instance.
344        context: The context.
345    """
346    if getattr(self, '__pydantic_private__', None) is None:
347        pydantic_private = {}
348        for name, private_attr in self.__private_attributes__.items():
349            default = private_attr.get_default()
350            if default is not PydanticUndefined:
351                pydantic_private[name] = default
352        object_setattr(self, '__pydantic_private__', pydantic_private)

This function is meant to behave like a BaseModel method to initialise private attributes.

It takes context as an argument since that's what pydantic-core passes when calling it.

Args:
    self: The BaseModel instance.
    context: The context.

class EvalDataType(builtins.str, enum.Enum):
319class EvalDataType(str, Enum):
320    final_answer = "final_answer"
321    full_trace = "full_trace"
322    reference_answer = "reference_answer"

str(object='') -> str str(bytes_or_buffer[, encoding[, errors]]) -> str

Create a new string object from the given object. If encoding or errors is specified, then the object must expose a data buffer that will be decoded using the given encoding and error handler. Otherwise, returns the result of object.__str__() (if defined) or repr(object). encoding defaults to 'utf-8'. errors defaults to 'strict'.

final_answer = <EvalDataType.final_answer: 'final_answer'>
full_trace = <EvalDataType.full_trace: 'full_trace'>
reference_answer = <EvalDataType.reference_answer: 'reference_answer'>
class Eval(kiln_ai.datamodel.basemodel.KilnParentedModel, kiln_ai.datamodel.basemodel.KilnParentModel):
325class Eval(KilnParentedModel, KilnParentModel, parent_of={"configs": EvalConfig}):
326    name: FilenameString = Field(description="The name of the eval.")
327    description: str | None = Field(
328        default=None, description="The description of the eval"
329    )
330    template: EvalTemplateId | None = Field(
331        default=None,
332        description="The template selected when creating this eval. Useful for suggesting eval steps and output scores.",
333    )
334    current_config_id: ID_TYPE = Field(
335        default=None,
336        description="The id of the current config to use for this eval. This can be changed over time to run the same eval with different configs.",
337    )
338    eval_set_filter_id: DatasetFilterId = Field(
339        description="The id of the dataset filter which defines which dataset items are included when running this eval. Should be mutually exclusive with eval_configs_filter_id and train_set_filter_id."
340    )
341    eval_configs_filter_id: DatasetFilterId | None = Field(
342        default=None,
343        description="The id of the dataset filter which defines which dataset items are included when comparing the quality of the eval configs under this eval. Should consist of dataset items with ratings. Should be mutually exclusive with eval_set_filter_id.",
344    )
345    train_set_filter_id: DatasetFilterId | None = Field(
346        default=None,
347        description="The id of the dataset filter which defines which dataset items are included in the training set for fine-tuning. Should be mutually exclusive with eval_set_filter_id.",
348    )
349    output_scores: List[EvalOutputScore] = Field(
350        description="The scores this evaluator should produce."
351    )
352    favourite: bool = Field(
353        default=False,
354        description="Whether this eval is a favourite of the user. Rendered as a star icon in the UI.",
355    )
356    template_properties: dict[str, str | int | bool | float] | None = Field(
357        default=None,
358        description="Properties to be used to execute the eval. This is template_type specific and should serialize to a json dict.",
359    )
360    evaluation_data_type: EvalDataType = Field(
361        default=EvalDataType.final_answer,
362        description="The output of the task run to evaluate. Can be final answer or full trace.",
363    )
364
365    # Workaround to return typed parent without importing Task
366    def parent_task(self) -> Union["Task", None]:
367        if self.parent is not None and self.parent.__class__.__name__ != "Task":
368            raise ValueError("parent must be a Task")
369        return self.parent  # type: ignore
370
371    def configs(self, readonly: bool = False) -> list[EvalConfig]:
372        return super().configs(readonly=readonly)  # type: ignore
373
374    # Workaround to return typed parent without importing Spec
375    def associated_spec(self, readonly: bool = False) -> Union["Spec", None]:
376        """
377        Get the spec associated with this eval, if any.
378        Returns None for legacy evals that are not associated with a spec.
379        """
380
381        task = self.parent_task()
382        if not task or not self.id:
383            return None
384
385        specs = task.specs(readonly=readonly)
386        for spec in specs:
387            if spec.eval_id == self.id:
388                return spec
389        return None
390
391    @model_validator(mode="after")
392    def upgrade_old_reference_answer_eval_config(self) -> Self:
393        """
394        Migration: Set the first judge config as the default for existing reference answer evals that don't have a current_config_id set.
395
396        For reference_answer evals that don't have a current_config_id set, this migration
397        will set the first config (by created_at) as the default.
398        """
399        if self.id is None:
400            return self
401
402        # Only run during file loading
403        if not self._loaded_from_file:
404            return self
405
406        # Skip if already migrated (has a current_config_id set)
407        if self.current_config_id is not None:
408            return self
409
410        # Only migrate reference_answer evals
411        if self.evaluation_data_type != EvalDataType.reference_answer:
412            return self
413
414        # Prevent recursion: self.configs() loads child files, which re-loads this parent
415        # (see basemodel.py where we iterate_children_paths_of_parent_path calls load_from_file)
416        # This causes the validator to run again, creating an infinite loop without this guard.
417        with _migration_lock:
418            if self.id in _currently_migrating_eval_ids:
419                return self
420            _currently_migrating_eval_ids.add(self.id)
421
422        try:
423            # Get the configs - these are loaded from child files
424            configs_list = self.configs(readonly=True)
425            if configs_list and len(configs_list) > 0:
426                # Sort by created_at to get the oldest (first created) config
427                sorted_configs = sorted(configs_list, key=lambda c: c.created_at)
428                self.current_config_id = sorted_configs[0].id
429        finally:
430            with _migration_lock:
431                _currently_migrating_eval_ids.discard(self.id)
432
433        return self
434
@model_validator(mode="after")
def migrate_train_set_filter_id(self) -> Self:
    """
    Migration: Auto-create a train_set_filter_id for legacy evals that don't have one.

    Generates a tag-based filter ID from the eval name following the convention
    used by spec-based evals (e.g., "train_{name_slug}").
    """
    # Only persisted evals that are missing the filter need the migration.
    needs_migration = (
        self.id is not None
        and self._loaded_from_file
        and self.train_set_filter_id is None
    )
    if needs_migration:
        slug = self.name.lower().replace(" ", "_")
        self.train_set_filter_id = f"tag::train_{slug}"
    return self
455
@model_validator(mode="after")
def validate_scores(self) -> Self:
    """Ensure at least one output score exists and JSON keys are unique."""
    if not self.output_scores:
        raise ValueError(
            "output_scores are required, and must have at least one score."
        )

    # Names may collide once converted to JSON keys, so compare the keys.
    keys = [score.json_key() for score in self.output_scores]
    if len(set(keys)) != len(keys):
        raise ValueError(
            f"output_scores must have unique names (once transformed to JSON keys). Got: [{', '.join(keys)}]"
        )
    return self
470
@model_validator(mode="after")
def validate_template_properties(self) -> Self:
    """Validate template-specific requirements (legacy evals only).

    Spec-based evals carry template_properties of None and are validated
    on their Spec instead.
    """
    # Every template except "rag" requires a judge-config dataset filter.
    if self.eval_configs_filter_id is None and self.template is not EvalTemplateId.rag:
        raise ValueError(
            "eval_configs_filter_id is required for all templates except 'rag'"
        )

    props = self.template_properties
    if props is None:
        return self

    if self.template == EvalTemplateId.issue:
        # issue_prompt: required string.
        if not isinstance(props.get("issue_prompt"), str):
            raise ValueError("issue_prompt is required for issue template")
        # failure_example / pass_example: optional, but must be strings when present.
        if not isinstance(props.get("failure_example", ""), str):
            raise ValueError(
                "failure_example is optional for issue template, but if provided must be a string"
            )
        if not isinstance(props.get("pass_example", ""), str):
            raise ValueError(
                "pass_example is optional for issue template, but if provided must be a string"
            )

    if self.template == EvalTemplateId.tool_call:
        if self.evaluation_data_type != EvalDataType.full_trace:
            raise ValueError(
                "tool_call template should have evaluation_data_type set to full_trace"
            )
        # tool: required, non-blank string.
        tool = props.get("tool")
        if not (isinstance(tool, str) and tool.strip()):
            raise ValueError("tool is required for tool call template")
        # tool_function_name: required string (blank allowed).
        if not isinstance(props.get("tool_function_name"), str):
            raise ValueError(
                "tool_function_name is required for tool call template"
            )
        # appropriate_tool_use_guidelines: required, non-blank string.
        guidelines = props.get("appropriate_tool_use_guidelines")
        if not (isinstance(guidelines, str) and guidelines.strip()):
            raise ValueError(
                "appropriate_tool_use_guidelines is required for tool call template"
            )
        # inappropriate_tool_use_guidelines: optional, string when present.
        if not isinstance(props.get("inappropriate_tool_use_guidelines", ""), str):
            raise ValueError(
                "inappropriate_tool_use_guidelines is optional for tool call template, but if provided must be a string"
            )
    return self

Base model for Kiln models that have a parent-child relationship. This base class is for child models.

This class provides functionality for managing hierarchical relationships between models, including parent reference handling and file system organization.

Attributes: parent (KilnBaseModel): Reference to the parent model instance. Not persisted, just in memory.

name: FilenameString
description: str | None
template: EvalTemplateId | None
current_config_id: Optional[str]
eval_set_filter_id: DatasetFilterId
eval_configs_filter_id: Optional[DatasetFilterId]
train_set_filter_id: Optional[DatasetFilterId]
output_scores: List[EvalOutputScore]
favourite: bool
template_properties: dict[str, str | int | bool | float] | None
evaluation_data_type: EvalDataType
def parent_task(self) -> Union["Task", None]:
    """Return the parent Task, or None if no parent is set.

    Raises:
        ValueError: if a parent is set but is not a Task.
    """
    parent = self.parent
    if parent is None:
        return None
    if parent.__class__.__name__ != "Task":
        raise ValueError("parent must be a Task")
    return parent  # type: ignore
def configs(self, readonly=False) -> List[EvalConfig]:
709        def child_method(self, readonly: bool = False) -> list[child_class]:  # type: ignore[invalid-type-form]
710            return child_class.all_children_of_parent_path(self.path, readonly=readonly)

Returns all EvalConfig children of this eval, loaded from the files under this eval's path.

def associated_spec(self, readonly: bool = False) -> Union["Spec", None]:
    """
    Get the spec associated with this eval, if any.
    Returns None for legacy evals that are not associated with a spec.
    """
    task = self.parent_task()
    # A spec can only exist for a persisted eval attached to a task.
    if not task or not self.id:
        return None

    # Linear scan: a task has few specs, and at most one points at this eval.
    return next(
        (spec for spec in task.specs(readonly=readonly) if spec.eval_id == self.id),
        None,
    )

Get the spec associated with this eval, if any. Returns None for legacy evals that are not associated with a spec.

@model_validator(mode="after")
def upgrade_old_reference_answer_eval_config(self) -> Self:
    """
    Migration: pick a default judge config for legacy reference_answer evals.

    When loading from file, a reference_answer eval with no current_config_id
    gets its oldest config (by created_at) assigned as the default.
    """
    # Applies only to persisted reference_answer evals still missing a default.
    if (
        self.id is None
        or not self._loaded_from_file
        or self.current_config_id is not None
        or self.evaluation_data_type != EvalDataType.reference_answer
    ):
        return self

    # self.configs() loads child files, which re-loads this parent and re-runs
    # this validator; the module-level set breaks that recursion.
    with _migration_lock:
        if self.id in _currently_migrating_eval_ids:
            return self
        _currently_migrating_eval_ids.add(self.id)

    try:
        candidates = self.configs(readonly=True)
        if candidates:
            # Oldest config (first created) becomes the default.
            oldest = min(candidates, key=lambda cfg: cfg.created_at)
            self.current_config_id = oldest.id
    finally:
        with _migration_lock:
            _currently_migrating_eval_ids.discard(self.id)

    return self

Migration: Set the first judge config as the default for existing reference answer evals that don't have a current_config_id set.

For reference_answer evals that don't have a current_config_id set, this migration will set the first config (by created_at) as the default.

@model_validator(mode="after")
def migrate_train_set_filter_id(self) -> Self:
    """
    Migration: Auto-create a train_set_filter_id for legacy evals that don't have one.

    Generates a tag-based filter ID from the eval name following the convention
    used by spec-based evals (e.g., "train_{name_slug}").
    """
    # Only persisted evals that are missing the filter need the migration.
    needs_migration = (
        self.id is not None
        and self._loaded_from_file
        and self.train_set_filter_id is None
    )
    if needs_migration:
        slug = self.name.lower().replace(" ", "_")
        self.train_set_filter_id = f"tag::train_{slug}"
    return self

Migration: Auto-create a train_set_filter_id for legacy evals that don't have one.

Generates a tag-based filter ID from the eval name following the convention used by spec-based evals (e.g., "train_{name_slug}").

@model_validator(mode="after")
def validate_scores(self) -> Self:
    """Ensure at least one output score exists and JSON keys are unique."""
    if not self.output_scores:
        raise ValueError(
            "output_scores are required, and must have at least one score."
        )

    # Names may collide once converted to JSON keys, so compare the keys.
    keys = [score.json_key() for score in self.output_scores]
    if len(set(keys)) != len(keys):
        raise ValueError(
            f"output_scores must have unique names (once transformed to JSON keys). Got: [{', '.join(keys)}]"
        )
    return self
@model_validator(mode="after")
def validate_template_properties(self) -> Self:
    """Validate template-specific requirements (legacy evals only).

    Spec-based evals carry template_properties of None and are validated
    on their Spec instead.
    """
    # Every template except "rag" requires a judge-config dataset filter.
    if self.eval_configs_filter_id is None and self.template is not EvalTemplateId.rag:
        raise ValueError(
            "eval_configs_filter_id is required for all templates except 'rag'"
        )

    props = self.template_properties
    if props is None:
        return self

    if self.template == EvalTemplateId.issue:
        # issue_prompt: required string.
        if not isinstance(props.get("issue_prompt"), str):
            raise ValueError("issue_prompt is required for issue template")
        # failure_example / pass_example: optional, but must be strings when present.
        if not isinstance(props.get("failure_example", ""), str):
            raise ValueError(
                "failure_example is optional for issue template, but if provided must be a string"
            )
        if not isinstance(props.get("pass_example", ""), str):
            raise ValueError(
                "pass_example is optional for issue template, but if provided must be a string"
            )

    if self.template == EvalTemplateId.tool_call:
        if self.evaluation_data_type != EvalDataType.full_trace:
            raise ValueError(
                "tool_call template should have evaluation_data_type set to full_trace"
            )
        # tool: required, non-blank string.
        tool = props.get("tool")
        if not (isinstance(tool, str) and tool.strip()):
            raise ValueError("tool is required for tool call template")
        # tool_function_name: required string (blank allowed).
        if not isinstance(props.get("tool_function_name"), str):
            raise ValueError(
                "tool_function_name is required for tool call template"
            )
        # appropriate_tool_use_guidelines: required, non-blank string.
        guidelines = props.get("appropriate_tool_use_guidelines")
        if not (isinstance(guidelines, str) and guidelines.strip()):
            raise ValueError(
                "appropriate_tool_use_guidelines is required for tool call template"
            )
        # inappropriate_tool_use_guidelines: optional, string when present.
        if not isinstance(props.get("inappropriate_tool_use_guidelines", ""), str):
            raise ValueError(
                "inappropriate_tool_use_guidelines is optional for tool call template, but if provided must be a string"
            )
    return self
def relationship_name() -> str:
727        def relationship_name_method() -> str:
728            return relationship_name

Returns the relationship name linking this child class to its parent.

def parent_type() -> Type[kiln_ai.datamodel.basemodel.KilnParentModel]:
720        def parent_class_method() -> Type[KilnParentModel]:
721            return cls

Returns the parent model class for this child class.

model_config = {'validate_assignment': True}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

def init_private_attributes(self: BaseModel, context: Any, /) -> None:
    """This function is meant to behave like a BaseModel method to initialise private attributes.

    It takes context as an argument since that's what pydantic-core passes when calling it.

    Args:
        self: The BaseModel instance.
        context: The context.
    """
    # Already initialised (e.g. by a subclass) — nothing to do.
    if getattr(self, '__pydantic_private__', None) is not None:
        return
    # Seed each declared private attribute from its default, skipping
    # attributes with no default (left unset until first assignment).
    private_values: dict = {}
    for attr_name, private_attr in self.__private_attributes__.items():
        default_value = private_attr.get_default()
        if default_value is not PydanticUndefined:
            private_values[attr_name] = default_value
    object_setattr(self, '__pydantic_private__', private_values)

This function is meant to behave like a BaseModel method to initialise private attributes.

It takes context as an argument since that's what pydantic-core passes when calling it.

Args: self: The BaseModel instance. context: The context.