kiln_ai.datamodel

See our docs for details about our datamodel: https://kiln-ai.github.io/Kiln/kiln_core_docs/kiln_ai.html

  1"""
  2See our docs for details about our datamodel: https://kiln-ai.github.io/Kiln/kiln_core_docs/kiln_ai.html
  3"""
  4
  5from __future__ import annotations
  6
  7import json
  8import math
  9import random
 10from enum import Enum, IntEnum
 11from typing import TYPE_CHECKING, Callable, Dict, List, Type, Union
 12
 13import jsonschema
 14import jsonschema.exceptions
 15from pydantic import (
 16    BaseModel,
 17    Field,
 18    ValidationInfo,
 19    model_validator,
 20)
 21from typing_extensions import Self
 22
 23from kiln_ai.datamodel.json_schema import JsonObjectSchema, schema_from_json_str
 24
 25from .basemodel import (
 26    ID_FIELD,
 27    ID_TYPE,
 28    NAME_FIELD,
 29    SHORT_NAME_FIELD,
 30    KilnBaseModel,
 31    KilnParentedModel,
 32    KilnParentModel,
 33)
 34from .json_schema import validate_schema
 35
 36if TYPE_CHECKING:
 37    from . import Task
 38
 39
 40__all__ = [
 41    "basemodel",
 42    "json_schema",
 43    "Task",
 44    "Project",
 45    "TaskRun",
 46    "TaskOutput",
 47    "TaskOutputRating",
 48    "Priority",
 49    "DataSource",
 50    "DataSourceType",
 51    "DataSourceProperty",
 52    "Finetune",
 53    "FineTuneStatusType",
 54    "TaskOutputRatingType",
 55    "TaskRequirement",
 56    "TaskDeterminism",
 57    "DatasetSplitDefinition",
 58    "DatasetSplit",
 59    "RequirementRating",
 60    "TaskRequirement",
 61    "strict_mode",
 62    "set_strict_mode",
 63]
 64
 65
 66# We want to be hard on ourselves for data completeness generated by the Kiln App, but don't want to make it hard for users to use the datamodel/library.
 67# Strict mode enables extra validations that we want to enforce in Kiln App (and any other client that wants best practices), but not in the library (unless they opt in)
 68_strict_mode: bool = False
 69
 70
 71def strict_mode() -> bool:
 72    return _strict_mode
 73
 74
 75def set_strict_mode(value: bool) -> None:
 76    global _strict_mode
 77    _strict_mode = value
 78
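
Example (a usage sketch, not part of the module source): clients that want the Kiln App's stricter data-completeness checks can opt in before creating new records.

from kiln_ai import datamodel

datamodel.set_strict_mode(True)
assert datamodel.strict_mode()
# While enabled, newly created TaskRun/TaskOutput objects must carry an input_source/source.
datamodel.set_strict_mode(False)
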

class Priority(IntEnum):
    """Defines priority levels for tasks and requirements, where P0 is highest priority."""

    p0 = 0
    p1 = 1
    p2 = 2
    p3 = 3


# Only one rating type for now, but this allows for extensibility if we want to add more in the future
class TaskOutputRatingType(str, Enum):
    """Defines the types of rating systems available for task outputs."""

    five_star = "five_star"
    pass_fail = "pass_fail"
    pass_fail_critical = "pass_fail_critical"
    custom = "custom"


class RequirementRating(BaseModel):
    """Rating for a specific requirement within a task output."""

    value: float = Field(
        description="The rating value. Interpretation depends on rating type"
    )
    type: TaskOutputRatingType = Field(description="The type of rating")


class TaskOutputRating(KilnBaseModel):
    """
    A rating for a task output, including an overall rating and ratings for each requirement.

    Supports:
    - five_star: 1-5 star ratings
    - pass_fail: boolean pass/fail (1.0 = pass, 0.0 = fail)
    - pass_fail_critical: tri-state (1.0 = pass, 0.0 = fail, -1.0 = critical fail)
    """

    type: TaskOutputRatingType = Field(default=TaskOutputRatingType.five_star)
    value: float | None = Field(
        description="The rating value. Interpretation depends on rating type:\n- five_star: 1-5 stars\n- pass_fail: 1.0 (pass) or 0.0 (fail)\n- pass_fail_critical: 1.0 (pass), 0.0 (fail), or -1.0 (critical fail)",
        default=None,
    )
    requirement_ratings: Dict[ID_TYPE, RequirementRating] = Field(
        default={},
        description="The ratings of the requirements of the task.",
    )

    # Previously we stored rating values as a dict of floats; now we store them as RequirementRating objects.
    @model_validator(mode="before")
    def upgrade_old_format(cls, data: dict) -> dict:
        if not isinstance(data, dict):
            return data

        # Check if we have the old format (dict of floats)
        req_ratings = data.get("requirement_ratings", {})
        if req_ratings and all(
            isinstance(v, (int, float)) for v in req_ratings.values()
        ):
            # Convert each float to a RequirementRating object.
            # All ratings were five_star at the time this format was in use.
            data["requirement_ratings"] = {
                k: {"value": v, "type": TaskOutputRatingType.five_star}
                for k, v in req_ratings.items()
            }

        return data

    # Used to select high-quality outputs for example selection (MultiShotPromptBuilder, etc.)
    def is_high_quality(self) -> bool:
        if self.value is None:
            return False

        if self.type == TaskOutputRatingType.five_star:
            return self.value >= 4
        elif self.type == TaskOutputRatingType.pass_fail:
            return self.value == 1.0
        elif self.type == TaskOutputRatingType.pass_fail_critical:
            return self.value == 1.0
        return False

    @model_validator(mode="after")
    def validate_rating(self) -> Self:
        if self.type not in TaskOutputRatingType:
            raise ValueError(f"Invalid rating type: {self.type}")

        # Overall rating is optional
        if self.value is not None:
            self._validate_rating(self.type, self.value, "overall rating")

        for req_id, req_rating in self.requirement_ratings.items():
            self._validate_rating(
                req_rating.type,
                req_rating.value,
                f"requirement rating for req ID: {req_id}",
            )

        return self

    def _validate_rating(
        self, type: TaskOutputRatingType, rating: float | None, rating_name: str
    ) -> None:
        if type == TaskOutputRatingType.five_star:
            self._validate_five_star(rating, rating_name)
        elif type == TaskOutputRatingType.pass_fail:
            self._validate_pass_fail(rating, rating_name)
        elif type == TaskOutputRatingType.pass_fail_critical:
            self._validate_pass_fail_critical(rating, rating_name)

    def _validate_five_star(self, rating: float | None, rating_name: str) -> None:
        if rating is None or not isinstance(rating, float) or not rating.is_integer():
            raise ValueError(
                f"{rating_name.capitalize()} of type five_star must be an integer value (1-5)"
            )
        if rating < 1 or rating > 5:
            raise ValueError(
                f"{rating_name.capitalize()} of type five_star must be between 1 and 5 stars"
            )

    def _validate_pass_fail(self, rating: float | None, rating_name: str) -> None:
        if rating is None or not isinstance(rating, float) or not rating.is_integer():
            raise ValueError(
                f"{rating_name.capitalize()} of type pass_fail must be an integer value (0 or 1)"
            )
        if rating not in [0, 1]:
            raise ValueError(
                f"{rating_name.capitalize()} of type pass_fail must be 0 (fail) or 1 (pass)"
            )

    def _validate_pass_fail_critical(
        self, rating: float | None, rating_name: str
    ) -> None:
        if rating is None or not isinstance(rating, float) or not rating.is_integer():
            raise ValueError(
                f"{rating_name.capitalize()} of type pass_fail_critical must be an integer value (-1, 0, or 1)"
            )
        if rating not in [-1, 0, 1]:
            raise ValueError(
                f"{rating_name.capitalize()} of type pass_fail_critical must be -1 (critical fail), 0 (fail), or 1 (pass)"
            )

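
Example (a usage sketch, not part of the module source): building a rating and checking whether it qualifies as a high-quality example. The requirement ID is a hypothetical placeholder; real keys come from TaskRequirement.id. This assumes KilnBaseModel's metadata fields (id, created_at, etc.) have defaults.

from kiln_ai.datamodel import RequirementRating, TaskOutputRating, TaskOutputRatingType

rating = TaskOutputRating(
    type=TaskOutputRatingType.five_star,
    value=4.0,  # five_star values must be whole numbers from 1 to 5
    requirement_ratings={
        "req_123": RequirementRating(value=1.0, type=TaskOutputRatingType.pass_fail),
    },
)
assert rating.is_high_quality()  # five_star ratings of 4 or 5 count as high quality
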

class TaskOutput(KilnBaseModel):
    """
    An output for a specific task run.

    Contains the actual output content, its source (human or synthetic),
    and optional rating information.
    """

    output: str = Field(
        description="The output of the task. JSON formatted for structured output, plaintext for unstructured output."
    )
    source: DataSource | None = Field(
        description="The source of the output: human or synthetic.",
        default=None,
    )
    rating: TaskOutputRating | None = Field(
        default=None, description="The rating of the output"
    )

    def validate_output_format(self, task: Task) -> Self:
        # validate output
        if task.output_json_schema is not None:
            try:
                validate_schema(json.loads(self.output), task.output_json_schema)
            except json.JSONDecodeError:
                raise ValueError("Output is not a valid JSON object")
            except jsonschema.exceptions.ValidationError as e:
                raise ValueError(f"Output does not match task output schema: {e}")
        return self

    @model_validator(mode="after")
    def validate_output_source(self, info: ValidationInfo) -> Self:
        # In strict mode, when not loaded from file, we validate that output_source is not None.
        # We want to be able to load any data, even if it's imperfect, but we want newly created data to be complete.
        if not strict_mode():
            return self
        if self.loaded_from_file(info):
            return self
        if self.source is None:
            raise ValueError("Output source is required when strict mode is enabled")
        return self

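
Example (a usage sketch, not part of the module source): a synthetic TaskOutput. Synthetic sources must carry model_name, model_provider and adapter_name (see DataSource below); the property values here are hypothetical placeholders.

from kiln_ai.datamodel import DataSource, DataSourceType, TaskOutput, TaskOutputRating

output = TaskOutput(
    output='{"answer": "42"}',  # a JSON string when the parent task defines an output schema
    source=DataSource(
        type=DataSourceType.synthetic,
        properties={
            "model_name": "example-model",
            "model_provider": "example-provider",
            "adapter_name": "example-adapter",
        },
    ),
    rating=TaskOutputRating(value=5.0),
)
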

class FineTuneStatusType(str, Enum):
    """
    The status type of a fine-tune (running, completed, failed, etc).
    """

    unknown = "unknown"  # server error
    pending = "pending"
    running = "running"
    completed = "completed"
    failed = "failed"


class Finetune(KilnParentedModel):
    name: str = NAME_FIELD
    description: str | None = Field(
        default=None,
        description="A description of the fine-tune for you and your team. Not used in training.",
    )
    provider: str = Field(
        description="The provider to use for the fine-tune (e.g. 'openai')."
    )
    base_model_id: str = Field(
        description="The ID of the base model to use for the fine-tune. This string uses the provider's IDs for their own models, not Kiln IDs."
    )
    provider_id: str | None = Field(
        default=None,
        description="The ID of the fine-tune job on the provider's side. May not be the same as the fine_tune_model_id.",
    )
    fine_tune_model_id: str | None = Field(
        default=None,
        description="The ID of the fine-tuned model on the provider's side. May not be the same as the provider_id.",
    )
    dataset_split_id: str = Field(
        description="The ID of the dataset split to use for this fine-tune.",
    )
    train_split_name: str = Field(
        default="train",
        description="The name of the training split to use for this fine-tune.",
    )
    validation_split_name: str | None = Field(
        default=None,
        description="The name of the validation split to use for this fine-tune. Optional.",
    )
    parameters: dict[str, str | int | float | bool] = Field(
        default={},
        description="The parameters to use for this fine-tune. These are provider-specific.",
    )
    system_message: str = Field(
        description="The system message to use for this fine-tune.",
    )
    latest_status: FineTuneStatusType = Field(
        default=FineTuneStatusType.unknown,
        description="The latest known status of this fine-tune. Not updated in real time.",
    )
    properties: Dict[str, str | int | float] = Field(
        default={},
        description="Properties of the fine-tune. Different providers may use different properties.",
    )

    def parent_task(self) -> Task | None:
        if not isinstance(self.parent, Task):
            return None
        return self.parent

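
Example (a usage sketch, not part of the module source): recording a fine-tune job. All IDs and parameters below are hypothetical placeholders; base_model_id uses the provider's own model naming, not Kiln's.

from kiln_ai.datamodel import Finetune

finetune = Finetune(
    name="support-bot-v1",
    provider="openai",
    base_model_id="example-base-model",
    dataset_split_id="split_123",
    system_message="You are a helpful support agent.",
    parameters={"epochs": 3},
)
# latest_status defaults to FineTuneStatusType.unknown; it is not updated in real time.
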

class DataSourceType(str, Enum):
    """
    The source type of a piece of data.

    Human: a human created the data
    Synthetic: a model created the data
    """

    human = "human"
    synthetic = "synthetic"


class DataSourceProperty(BaseModel):
    """
    Defines a property that can be associated with a data source.

    Includes validation rules for when properties are required or not allowed
    based on the data source type.
    """

    name: str
    type: Type[Union[str, int, float]]
    required_for: List[DataSourceType] = []
    not_allowed_for: List[DataSourceType] = []


class DataSource(BaseModel):
    """
    Represents the origin of data, either human or synthetic, with associated properties.

    Properties vary based on the source type - for synthetic sources this includes
    model information, for human sources this includes creator information.
    """

    type: DataSourceType
    properties: Dict[str, str | int | float] = Field(
        default={},
        description="Properties describing the data source. For synthetic sources, things like the model used. For human sources, the human's name.",
    )

    _data_source_properties = [
        DataSourceProperty(
            name="created_by",
            type=str,
            required_for=[DataSourceType.human],
            not_allowed_for=[DataSourceType.synthetic],
        ),
        DataSourceProperty(
            name="model_name",
            type=str,
            required_for=[DataSourceType.synthetic],
            not_allowed_for=[DataSourceType.human],
        ),
        DataSourceProperty(
            name="model_provider",
            type=str,
            required_for=[DataSourceType.synthetic],
            not_allowed_for=[DataSourceType.human],
        ),
        DataSourceProperty(
            name="adapter_name",
            type=str,
            required_for=[DataSourceType.synthetic],
            not_allowed_for=[DataSourceType.human],
        ),
        DataSourceProperty(
            name="prompt_builder_name",
            type=str,
            not_allowed_for=[DataSourceType.human],
        ),
    ]

    @model_validator(mode="after")
    def validate_type(self) -> "DataSource":
        if self.type not in DataSourceType:
            raise ValueError(f"Invalid data source type: {self.type}")
        return self

    @model_validator(mode="after")
    def validate_properties(self) -> "DataSource":
        for prop in self._data_source_properties:
            # Check the property type is correct
            if prop.name in self.properties:
                if not isinstance(self.properties[prop.name], prop.type):
                    raise ValueError(
                        f"'{prop.name}' must be of type {prop.type.__name__} for {self.type} data source"
                    )
            # Check the property is required for the data source type
            if self.type in prop.required_for:
                if prop.name not in self.properties:
                    raise ValueError(
                        f"'{prop.name}' is required for {self.type} data source"
                    )
            # Check the property is not allowed for the data source type
            elif self.type in prop.not_allowed_for and prop.name in self.properties:
                raise ValueError(
                    f"'{prop.name}' is not allowed for {self.type} data source"
                )
        return self

    @model_validator(mode="after")
    def validate_no_empty_properties(self) -> Self:
        for prop, value in self.properties.items():
            if isinstance(value, str) and value == "":
                raise ValueError(
                    f"Property '{prop}' must be a non-empty string for {self.type} data source"
                )
        return self

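
Example (a usage sketch, not part of the module source): property validation in action. Human sources require created_by and reject model properties; pydantic surfaces the validator's ValueError as a ValidationError (itself a ValueError).

from kiln_ai.datamodel import DataSource, DataSourceType

human_source = DataSource(
    type=DataSourceType.human,
    properties={"created_by": "jane@example.com"},
)

try:
    DataSource(type=DataSourceType.human, properties={"model_name": "some-model"})
except ValueError as error:
    print(error)  # message notes 'created_by' is required for human data sources
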

class TaskRun(KilnParentedModel):
    """
    Represents a single execution of a Task.

    Contains the input used, its source, the output produced, and optional
    repair information if the output needed correction.
    """

    input: str = Field(
        description="The inputs to the task. JSON formatted for structured input, plaintext for unstructured input."
    )
    input_source: DataSource | None = Field(
        default=None, description="The source of the input: human or synthetic."
    )

    output: TaskOutput = Field(description="The output of the task run.")
    repair_instructions: str | None = Field(
        default=None,
        description="Instructions for fixing the output. Should define what is wrong, and how to fix it. Will be used by models for both generating a fixed output, and evaluating future models.",
    )
    repaired_output: TaskOutput | None = Field(
        default=None,
        description="A version of the output with issues fixed. This must be a 'fixed' version of the existing output, and not an entirely new output. If you wish to generate an ideal curated output for this task unrelated to this output, generate a new TaskOutput with type 'human' instead of using this field.",
    )
    intermediate_outputs: Dict[str, str] | None = Field(
        default=None,
        description="Intermediate outputs from the task run. Keys are the names of the intermediate output steps (cot=chain of thought, etc), values are the output data.",
    )
    tags: List[str] = Field(
        default=[],
        description="Tags for the task run. Tags are used to categorize task runs for filtering and reporting.",
    )

    def parent_task(self) -> Task | None:
        if not isinstance(self.parent, Task):
            return None
        return self.parent

    @model_validator(mode="after")
    def validate_input_format(self) -> Self:
        task = self.parent_task()
        if task is None:
            # Don't validate this relationship until we have a path or parent. Give the caller time to build it (it will be caught before saving).
            return self

        # validate input
        if task.input_json_schema is not None:
            try:
                validate_schema(json.loads(self.input), task.input_json_schema)
            except json.JSONDecodeError:
                raise ValueError("Input is not a valid JSON object")
            except jsonschema.exceptions.ValidationError as e:
                raise ValueError(f"Input does not match task input schema: {e}")
        return self

    @model_validator(mode="after")
    def validate_output_format(self) -> Self:
        task = self.parent_task()
        if task is None:
            return self

        self.output.validate_output_format(task)
        return self

    @model_validator(mode="after")
    def validate_repaired_output(self) -> Self:
        if self.repaired_output is not None:
            if self.repaired_output.rating is not None:
                raise ValueError(
                    "Repaired output rating must be None. Repaired outputs are assumed to have a perfect rating, as they have been fixed."
                )
        if self.repair_instructions is None and self.repaired_output is not None:
            raise ValueError(
                "Repair instructions are required if providing a repaired output."
            )
        if self.repair_instructions is not None and self.repaired_output is None:
            raise ValueError(
                "A repaired output is required if providing repair instructions."
            )
        return self

    @model_validator(mode="after")
    def validate_input_source(self, info: ValidationInfo) -> Self:
        # In strict mode, when not loaded from file, we validate that input_source is not None.
        # We want to be able to load any data, even if it's imperfect, but we want newly created data to be complete.
        if not strict_mode():
            return self
        if self.loaded_from_file(info):
            return self
        if self.input_source is None:
            raise ValueError("input_source is required when strict mode is enabled")
        return self

    @model_validator(mode="after")
    def validate_tags(self) -> Self:
        for tag in self.tags:
            if not tag:
                raise ValueError("Tags cannot be empty strings")
            if " " in tag:
                raise ValueError("Tags cannot contain spaces. Try underscores.")

        return self

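
Example (a usage sketch, not part of the module source): a standalone TaskRun. Without a parent Task the schema validators are skipped; attach parent=some_task to enforce the task's input/output JSON schemas.

from kiln_ai.datamodel import DataSource, DataSourceType, TaskOutput, TaskRun

run = TaskRun(
    input="What is the capital of France?",
    input_source=DataSource(
        type=DataSourceType.human,
        properties={"created_by": "jane@example.com"},
    ),
    output=TaskOutput(output="Paris"),
    tags=["geography", "manually_reviewed"],  # tags must be non-empty and space-free
)
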

# Define the type alias for clarity
DatasetFilter = Callable[[TaskRun], bool]


def AllDatasetFilter(_: TaskRun) -> bool:
    return True


def HighRatingDatasetFilter(task_run: TaskRun) -> bool:
    if task_run.output is None or task_run.output.rating is None:
        return False
    return task_run.output.rating.is_high_quality()

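
Example (a usage sketch, not part of the module source): DatasetFilter is a plain callable on TaskRun, so custom filters slot in alongside AllDatasetFilter and HighRatingDatasetFilter. The "golden" tag below is a hypothetical convention.

def GoldenTagDatasetFilter(task_run: TaskRun) -> bool:
    # Keep only runs a reviewer has tagged as golden examples.
    return "golden" in task_run.tags
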

class DatasetSplitDefinition(BaseModel):
    """
    A definition of a split in a dataset.

    Example: name="train", description="The training set", percentage=0.8 (80% of the dataset)
    """

    name: str = NAME_FIELD
    description: str | None = Field(
        default=None,
        description="A description of the dataset for you and your team. Not used in training.",
    )
    percentage: float = Field(
        ge=0.0,
        le=1.0,
        description="The percentage of the dataset that this split represents (between 0 and 1).",
    )


AllSplitDefinition: list[DatasetSplitDefinition] = [
    DatasetSplitDefinition(name="all", percentage=1.0)
]
Train80Test20SplitDefinition: list[DatasetSplitDefinition] = [
    DatasetSplitDefinition(name="train", percentage=0.8),
    DatasetSplitDefinition(name="test", percentage=0.2),
]
Train60Test20Val20SplitDefinition: list[DatasetSplitDefinition] = [
    DatasetSplitDefinition(name="train", percentage=0.6),
    DatasetSplitDefinition(name="test", percentage=0.2),
    DatasetSplitDefinition(name="val", percentage=0.2),
]


class DatasetSplit(KilnParentedModel):
    """
    A collection of task runs, with optional splits (train, test, validation).

    Used to freeze a dataset into train/test/validation splits for repeatable fine-tuning or other tasks.

    Maintains a list of IDs for each split, to avoid data duplication.
    """

    name: str = NAME_FIELD
    description: str | None = Field(
        default=None,
        description="A description of the dataset for you and your team. Not used in training.",
    )
    splits: list[DatasetSplitDefinition] = Field(
        default_factory=list,
        description="The splits in the dataset.",
    )
    split_contents: dict[str, list[str]] = Field(
        description="The contents of each split in the dataset. The key is the split name, and the value is a list of task run IDs.",
    )

    @model_validator(mode="after")
    def validate_split_percentages(self) -> "DatasetSplit":
        total = sum(split.percentage for split in self.splits)
        if not math.isclose(total, 1.0, rel_tol=1e-9):
            raise ValueError(f"The sum of split percentages must be 1.0 (got {total})")
        return self

    @classmethod
    def from_task(
        cls,
        name: str,
        task: "Task",
        splits: list[DatasetSplitDefinition],
        filter: DatasetFilter = AllDatasetFilter,
        description: str | None = None,
    ):
        """
        Build a dataset split from a task.
        """
        split_contents = cls.build_split_contents(task, splits, filter)
        return cls(
            parent=task,
            name=name,
            description=description,
            splits=splits,
            split_contents=split_contents,
        )

    @classmethod
    def build_split_contents(
        cls,
        task: "Task",
        splits: list[DatasetSplitDefinition],
        filter: DatasetFilter,
    ) -> dict[str, list[str]]:
        valid_ids = []
        for task_run in task.runs():
            if filter(task_run):
                valid_ids.append(task_run.id)

        # Shuffle, then slice out each split by its percentage
        random.shuffle(valid_ids)
        split_contents = {}
        start_idx = 0
        remaining_items = len(valid_ids)

        # Handle all splits except the last one
        for split in splits[:-1]:
            split_size = round(len(valid_ids) * split.percentage)
            split_contents[split.name] = valid_ids[start_idx : start_idx + split_size]
            start_idx += split_size
            remaining_items -= split_size

        # The last split gets all remaining items (absorbing rounding error)
        if splits:
            split_contents[splits[-1].name] = valid_ids[start_idx:]

        return split_contents

    def parent_task(self) -> "Task | None":
        # inline import to avoid circular import
        from kiln_ai.datamodel import Task

        if not isinstance(self.parent, Task):
            return None
        return self.parent

    def missing_count(self) -> int:
        """
        Returns:
            int: the number of task run IDs persisted in this dataset split that no longer exist in the dataset
        """
        parent = self.parent_task()
        if parent is None:
            raise ValueError("DatasetSplit has no parent task")

        runs = parent.runs()
        all_ids = set(run.id for run in runs)
        all_ids_in_splits = set()
        for ids in self.split_contents.values():
            all_ids_in_splits.update(ids)
        missing = all_ids_in_splits - all_ids
        return len(missing)

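
Example (a usage sketch, not part of the module source): freezing a task's high-rated runs into an 80/20 split. This assumes `task` is a saved Task whose runs are available on disk, since build_split_contents iterates task.runs().

from kiln_ai.datamodel import (
    DatasetSplit,
    HighRatingDatasetFilter,
    Train80Test20SplitDefinition,
)

split = DatasetSplit.from_task(
    name="high_quality_v1",
    task=task,
    splits=Train80Test20SplitDefinition,
    filter=HighRatingDatasetFilter,
)
train_ids = split.split_contents["train"]  # shuffled task run IDs
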

class TaskRequirement(BaseModel):
    """
    Defines a specific requirement that should be met by task outputs.

    Includes an identifier, name, description, instruction for meeting the requirement,
    priority level, and rating type (five_star, pass_fail, pass_fail_critical, custom).
    """

    id: ID_TYPE = ID_FIELD
    name: str = SHORT_NAME_FIELD
    description: str | None = Field(default=None)
    instruction: str = Field(min_length=1)
    priority: Priority = Field(default=Priority.p2)
    type: TaskOutputRatingType = Field(default=TaskOutputRatingType.five_star)


class TaskDeterminism(str, Enum):
    """
    Defines how strictly task outputs should match expected results.

    - deterministic: Requires exact matches
    - semantic_match: Allows different wording with the same meaning
    - flexible: Allows variation in both wording and meaning within requirements
    """

    deterministic = "deterministic"  # Expect an exact match
    semantic_match = "semantic_match"  # Expect the same meaning, but be flexible about how it's expressed
    flexible = "flexible"  # Flexible on semantic output. Evals should be custom, based on parsing requirements.


class Task(
    KilnParentedModel,
    KilnParentModel,
    parent_of={
        "runs": TaskRun,
        "dataset_splits": DatasetSplit,
        "finetunes": Finetune,
    },
):
    """
    Represents a specific task to be performed, with associated requirements and validation rules.

    Contains the task definition, requirements, input/output schemas, and maintains
    a collection of task runs.
    """

    name: str = NAME_FIELD
    description: str | None = Field(
        default=None,
        description="A description of the task for you and your team. Will not be used in prompts/training/validation.",
    )
    instruction: str = Field(
        min_length=1,
        description="The instructions for the task. Will be used in prompts/training/validation.",
    )
    requirements: List[TaskRequirement] = Field(default=[])
    output_json_schema: JsonObjectSchema | None = None
    input_json_schema: JsonObjectSchema | None = None
    thinking_instruction: str | None = Field(
        default=None,
        description="Instructions for the model to 'think' about the requirement prior to answering. Used for chain-of-thought style prompting.",
    )

    def output_schema(self) -> Dict | None:
        if self.output_json_schema is None:
            return None
        return schema_from_json_str(self.output_json_schema)

    def input_schema(self) -> Dict | None:
        if self.input_json_schema is None:
            return None
        return schema_from_json_str(self.input_json_schema)

    # Needed for typechecking. TODO P2: fix this in KilnParentModel
    def runs(self) -> list[TaskRun]:
        return super().runs()  # type: ignore

    def dataset_splits(self) -> list[DatasetSplit]:
        return super().dataset_splits()  # type: ignore

    def finetunes(self) -> list[Finetune]:
        return super().finetunes()  # type: ignore

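
Example (a usage sketch, not part of the module source): a structured-output task. The schema string must parse as a JSON object schema; outputs of runs attached to this task are then validated against it. The requirement shown is illustrative.

import json

from kiln_ai.datamodel import Priority, Task, TaskRequirement

summarize_task = Task(
    name="summarize",
    instruction="Summarize the input text in one paragraph.",
    requirements=[
        TaskRequirement(
            name="concise",
            instruction="Keep the summary under 100 words.",
            priority=Priority.p1,
        ),
    ],
    output_json_schema=json.dumps(
        {
            "type": "object",
            "properties": {"summary": {"type": "string"}},
            "required": ["summary"],
        }
    ),
)
assert summarize_task.output_schema() is not None  # parsed dict form of the schema
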

class Project(KilnParentModel, parent_of={"tasks": Task}):
    """
    A collection of related tasks.

    Projects organize tasks into logical groups and provide high-level descriptions
    of the overall goals.
    """

    name: str = NAME_FIELD
    description: str | None = Field(
        default=None,
        description="A description of the project for you and your team. Will not be used in prompts/training/validation.",
    )

    # Needed for typechecking. TODO P2: fix this in KilnParentModel
    def tasks(self) -> list[Task]:
        return super().tasks()  # type: ignore
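
Example (a usage sketch, not part of the module source): Projects parent Tasks. The parent= keyword comes from KilnParentedModel (see .basemodel); listing project.tasks() assumes the hierarchy has been saved to disk, since child lookups traverse file paths.

from kiln_ai.datamodel import Project, Task

project = Project(name="demo_project", description="A scratch project.")
task = Task(
    name="classify",
    instruction="Classify the sentiment of the input text.",
    parent=project,
)
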
729class Task(
730    KilnParentedModel,
731    KilnParentModel,
732    parent_of={
733        "runs": TaskRun,
734        "dataset_splits": DatasetSplit,
735        "finetunes": Finetune,
736    },
737):
738    """
739    Represents a specific task to be performed, with associated requirements and validation rules.
740
741    Contains the task definition, requirements, input/output schemas, and maintains
742    a collection of task runs.
743    """
744
745    name: str = NAME_FIELD
746    description: str | None = Field(
747        default=None,
748        description="A description of the task for you and your team. Will not be used in prompts/training/validation.",
749    )
750    instruction: str = Field(
751        min_length=1,
752        description="The instructions for the task. Will be used in prompts/training/validation.",
753    )
754    requirements: List[TaskRequirement] = Field(default=[])
755    output_json_schema: JsonObjectSchema | None = None
756    input_json_schema: JsonObjectSchema | None = None
757    thinking_instruction: str | None = Field(
758        default=None,
759        description="Instructions for the model 'thinking' about the requirement prior to answering. Used for chain of thought style prompting.",
760    )
761
762    def output_schema(self) -> Dict | None:
763        if self.output_json_schema is None:
764            return None
765        return schema_from_json_str(self.output_json_schema)
766
767    def input_schema(self) -> Dict | None:
768        if self.input_json_schema is None:
769            return None
770        return schema_from_json_str(self.input_json_schema)
771
772    # Needed for typechecking. TODO P2: fix this in KilnParentModel
773    def runs(self) -> list[TaskRun]:
774        return super().runs()  # type: ignore
775
776    def dataset_splits(self) -> list[DatasetSplit]:
777        return super().dataset_splits()  # type: ignore
778
779    def finetunes(self) -> list[Finetune]:
780        return super().finetunes()  # type: ignore

Represents a specific task to be performed, with associated requirements and validation rules.

Contains the task definition, requirements, input/output schemas, and maintains a collection of task runs.

name: str
description: str | None
instruction: str
requirements: List[TaskRequirement]
output_json_schema: Optional[Annotated[str, AfterValidator(func=<function <lambda> at 0x7f02f676c0e0>)]]
input_json_schema: Optional[Annotated[str, AfterValidator(func=<function <lambda> at 0x7f02f676c0e0>)]]
thinking_instruction: str | None
def output_schema(self) -> Optional[Dict]:
762    def output_schema(self) -> Dict | None:
763        if self.output_json_schema is None:
764            return None
765        return schema_from_json_str(self.output_json_schema)
def input_schema(self) -> Optional[Dict]:
767    def input_schema(self) -> Dict | None:
768        if self.input_json_schema is None:
769            return None
770        return schema_from_json_str(self.input_json_schema)
def runs(self) -> List[TaskRun]:
398        def child_method(self) -> list[child_class]:
399            return child_class.all_children_of_parent_path(self.path)
def dataset_splits(self) -> List[DatasetSplit]:
398        def child_method(self) -> list[child_class]:
399            return child_class.all_children_of_parent_path(self.path)
def finetunes(self) -> List[Finetune]:
398        def child_method(self) -> list[child_class]:
399            return child_class.all_children_of_parent_path(self.path)
def relationship_name() -> str:
416        def relationship_name_method() -> str:
417            return relationship_name
def parent_type() -> Type[kiln_ai.datamodel.basemodel.KilnParentModel]:
409        def parent_class_method() -> Type[KilnParentModel]:
410            return cls
model_config = {'validate_assignment': True}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

def model_post_init(self: pydantic.main.BaseModel, context: Any, /) -> None:
122                    def wrapped_model_post_init(self: BaseModel, context: Any, /) -> None:
123                        """We need to both initialize private attributes and call the user-defined model_post_init
124                        method.
125                        """
126                        init_private_attributes(self, context)
127                        original_model_post_init(self, context)

We need to both initialize private attributes and call the user-defined model_post_init method.

class Project(kiln_ai.datamodel.basemodel.KilnParentModel):
783class Project(KilnParentModel, parent_of={"tasks": Task}):
784    """
785    A collection of related tasks.
786
787    Projects organize tasks into logical groups and provide high-level descriptions
788    of the overall goals.
789    """
790
791    name: str = NAME_FIELD
792    description: str | None = Field(
793        default=None,
794        description="A description of the project for you and your team. Will not be used in prompts/training/validation.",
795    )
796
797    # Needed for typechecking. TODO P2: fix this in KilnParentModel
798    def tasks(self) -> list[Task]:
799        return super().tasks()  # type: ignore

A collection of related tasks.

Projects organize tasks into logical groups and provide high-level descriptions of the overall goals.

name: str
description: str | None
def tasks(self) -> List[Task]:
398        def child_method(self) -> list[child_class]:
399            return child_class.all_children_of_parent_path(self.path)
model_config = {'validate_assignment': True}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

def model_post_init(self: pydantic.main.BaseModel, context: Any, /) -> None:
122                    def wrapped_model_post_init(self: BaseModel, context: Any, /) -> None:
123                        """We need to both initialize private attributes and call the user-defined model_post_init
124                        method.
125                        """
126                        init_private_attributes(self, context)
127                        original_model_post_init(self, context)

We need to both initialize private attributes and call the user-defined model_post_init method.

class TaskRun(kiln_ai.datamodel.basemodel.KilnParentedModel):
441class TaskRun(KilnParentedModel):
442    """
443    Represents a single execution of a Task.
444
445    Contains the input used, its source, the output produced, and optional
446    repair information if the output needed correction.
447    """
448
449    input: str = Field(
450        description="The inputs to the task. JSON formatted for structured input, plaintext for unstructured input."
451    )
452    input_source: DataSource | None = Field(
453        default=None, description="The source of the input: human or synthetic."
454    )
455
456    output: TaskOutput = Field(description="The output of the task run.")
457    repair_instructions: str | None = Field(
458        default=None,
459        description="Instructions for fixing the output. Should define what is wrong, and how to fix it. Will be used by models for both generating a fixed output, and evaluating future models.",
460    )
461    repaired_output: TaskOutput | None = Field(
462        default=None,
463        description="An version of the output with issues fixed. This must be a 'fixed' version of the existing output, and not an entirely new output. If you wish to generate an ideal curatorial output for this task unrelated to this output, generate a new TaskOutput with type 'human' instead of using this field.",
464    )
465    intermediate_outputs: Dict[str, str] | None = Field(
466        default=None,
467        description="Intermediate outputs from the task run. Keys are the names of the intermediate output steps (cot=chain of thought, etc), values are the output data.",
468    )
469    tags: List[str] = Field(
470        default=[],
471        description="Tags for the task run. Tags are used to categorize task runs for filtering and reporting.",
472    )
473
474    def parent_task(self) -> Task | None:
475        if not isinstance(self.parent, Task):
476            return None
477        return self.parent
478
479    @model_validator(mode="after")
480    def validate_input_format(self) -> Self:
481        task = self.parent_task()
482        if task is None:
483            # don't validate this relationship until we have a path or parent. Give them time to build it (but will catch it before saving)
484            return self
485
486        # validate output
487        if task.input_json_schema is not None:
488            try:
489                validate_schema(json.loads(self.input), task.input_json_schema)
490            except json.JSONDecodeError:
491                raise ValueError("Input is not a valid JSON object")
492            except jsonschema.exceptions.ValidationError as e:
493                raise ValueError(f"Input does not match task input schema: {e}")
494        return self
495
496    @model_validator(mode="after")
497    def validate_output_format(self) -> Self:
498        task = self.parent_task()
499        if task is None:
500            return self
501
502        self.output.validate_output_format(task)
503        return self
504
505    @model_validator(mode="after")
506    def validate_repaired_output(self) -> Self:
507        if self.repaired_output is not None:
508            if self.repaired_output.rating is not None:
509                raise ValueError(
510                    "Repaired output rating must be None. Repaired outputs are assumed to have a perfect rating, as they have been fixed."
511                )
512        if self.repair_instructions is None and self.repaired_output is not None:
513            raise ValueError(
514                "Repair instructions are required if providing a repaired output."
515            )
516        if self.repair_instructions is not None and self.repaired_output is None:
517            raise ValueError(
518                "A repaired output is required if providing repair instructions."
519            )
520        return self
521
522    @model_validator(mode="after")
523    def validate_input_source(self, info: ValidationInfo) -> Self:
524        # On strict mode and not loaded from file, we validate input_source is not None.
525        # We want to be able to load any data, even if it's not perfect. But we want to create perfect data when adding new data.
526        if not strict_mode():
527            return self
528        if self.loaded_from_file(info):
529            return self
530        if self.input_source is None:
531            raise ValueError("input_source is required when strict mode is enabled")
532        return self
533
534    @model_validator(mode="after")
535    def validate_tags(self) -> Self:
536        for tag in self.tags:
537            if not tag:
538                raise ValueError("Tags cannot be empty strings")
539            if " " in tag:
540                raise ValueError("Tags cannot contain spaces. Try underscores.")
541
542        return self

Represents a single execution of a Task.

Contains the input used, its source, the output produced, and optional repair information if the output needed correction.

input: str
input_source: DataSource | None
output: TaskOutput
repair_instructions: str | None
repaired_output: TaskOutput | None
intermediate_outputs: Optional[Dict[str, str]]
tags: List[str]
def parent_task(self) -> Task | None:
474    def parent_task(self) -> Task | None:
475        if not isinstance(self.parent, Task):
476            return None
477        return self.parent
@model_validator(mode='after')
def validate_input_format(self) -> Self:
479    @model_validator(mode="after")
480    def validate_input_format(self) -> Self:
481        task = self.parent_task()
482        if task is None:
483            # don't validate this relationship until we have a path or parent. Give them time to build it (but will catch it before saving)
484            return self
485
486        # validate output
487        if task.input_json_schema is not None:
488            try:
489                validate_schema(json.loads(self.input), task.input_json_schema)
490            except json.JSONDecodeError:
491                raise ValueError("Input is not a valid JSON object")
492            except jsonschema.exceptions.ValidationError as e:
493                raise ValueError(f"Input does not match task input schema: {e}")
494        return self
@model_validator(mode='after')
def validate_output_format(self) -> Self:
496    @model_validator(mode="after")
497    def validate_output_format(self) -> Self:
498        task = self.parent_task()
499        if task is None:
500            return self
501
502        self.output.validate_output_format(task)
503        return self
@model_validator(mode='after')
def validate_repaired_output(self) -> Self:
505    @model_validator(mode="after")
506    def validate_repaired_output(self) -> Self:
507        if self.repaired_output is not None:
508            if self.repaired_output.rating is not None:
509                raise ValueError(
510                    "Repaired output rating must be None. Repaired outputs are assumed to have a perfect rating, as they have been fixed."
511                )
512        if self.repair_instructions is None and self.repaired_output is not None:
513            raise ValueError(
514                "Repair instructions are required if providing a repaired output."
515            )
516        if self.repair_instructions is not None and self.repaired_output is None:
517            raise ValueError(
518                "A repaired output is required if providing repair instructions."
519            )
520        return self
@model_validator(mode='after')
def validate_input_source(self, info: pydantic_core.core_schema.ValidationInfo) -> Self:
522    @model_validator(mode="after")
523    def validate_input_source(self, info: ValidationInfo) -> Self:
524        # On strict mode and not loaded from file, we validate input_source is not None.
525        # We want to be able to load any data, even if it's not perfect. But we want to create perfect data when adding new data.
526        if not strict_mode():
527            return self
528        if self.loaded_from_file(info):
529            return self
530        if self.input_source is None:
531            raise ValueError("input_source is required when strict mode is enabled")
532        return self
@model_validator(mode='after')
def validate_tags(self) -> Self:
534    @model_validator(mode="after")
535    def validate_tags(self) -> Self:
536        for tag in self.tags:
537            if not tag:
538                raise ValueError("Tags cannot be empty strings")
539            if " " in tag:
540                raise ValueError("Tags cannot contain spaces. Try underscores.")
541
542        return self
def relationship_name() -> str:
416        def relationship_name_method() -> str:
417            return relationship_name
def parent_type() -> Type[kiln_ai.datamodel.basemodel.KilnParentModel]:
409        def parent_class_method() -> Type[KilnParentModel]:
410            return cls
model_config = {'validate_assignment': True}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

def model_post_init(self: pydantic.main.BaseModel, context: Any, /) -> None:
122                    def wrapped_model_post_init(self: BaseModel, context: Any, /) -> None:
123                        """We need to both initialize private attributes and call the user-defined model_post_init
124                        method.
125                        """
126                        init_private_attributes(self, context)
127                        original_model_post_init(self, context)

We need to both initialize private attributes and call the user-defined model_post_init method.

class TaskOutput(kiln_ai.datamodel.basemodel.KilnBaseModel):
223class TaskOutput(KilnBaseModel):
224    """
225    An output for a specific task run.
226
227    Contains the actual output content, its source (human or synthetic),
228    and optional rating information.
229    """
230
231    output: str = Field(
232        description="The output of the task. JSON formatted for structured output, plaintext for unstructured output."
233    )
234    source: DataSource | None = Field(
235        description="The source of the output: human or synthetic.",
236        default=None,
237    )
238    rating: TaskOutputRating | None = Field(
239        default=None, description="The rating of the output"
240    )
241
242    def validate_output_format(self, task: Task) -> Self:
243        # validate output
244        if task.output_json_schema is not None:
245            try:
246                validate_schema(json.loads(self.output), task.output_json_schema)
247            except json.JSONDecodeError:
248                raise ValueError("Output is not a valid JSON object")
249            except jsonschema.exceptions.ValidationError as e:
250                raise ValueError(f"Output does not match task output schema: {e}")
251        return self
252
253    @model_validator(mode="after")
254    def validate_output_source(self, info: ValidationInfo) -> Self:
255        # On strict mode and not loaded from file, we validate output_source is not None.
256        # We want to be able to load any data, even if it's not perfect. But we want to create perfect data when adding new data.
257        if not strict_mode():
258            return self
259        if self.loaded_from_file(info):
260            return self
261        if self.source is None:
262            raise ValueError("Output source is required when strict mode is enabled")
263        return self

An output for a specific task run.

Contains the actual output content, its source (human or synthetic), and optional rating information.

output: str
source: DataSource | None
rating: TaskOutputRating | None
def validate_output_format(self, task: Task) -> Self:
242    def validate_output_format(self, task: Task) -> Self:
243        # validate output
244        if task.output_json_schema is not None:
245            try:
246                validate_schema(json.loads(self.output), task.output_json_schema)
247            except json.JSONDecodeError:
248                raise ValueError("Output is not a valid JSON object")
249            except jsonschema.exceptions.ValidationError as e:
250                raise ValueError(f"Output does not match task output schema: {e}")
251        return self
@model_validator(mode='after')
def validate_output_source(self, info: pydantic_core.core_schema.ValidationInfo) -> Self:
253    @model_validator(mode="after")
254    def validate_output_source(self, info: ValidationInfo) -> Self:
255        # On strict mode and not loaded from file, we validate output_source is not None.
256        # We want to be able to load any data, even if it's not perfect. But we want to create perfect data when adding new data.
257        if not strict_mode():
258            return self
259        if self.loaded_from_file(info):
260            return self
261        if self.source is None:
262            raise ValueError("Output source is required when strict mode is enabled")
263        return self
model_config = {'validate_assignment': True}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

def model_post_init(self: pydantic.main.BaseModel, context: Any, /) -> None:
122                    def wrapped_model_post_init(self: BaseModel, context: Any, /) -> None:
123                        """We need to both initialize private attributes and call the user-defined model_post_init
124                        method.
125                        """
126                        init_private_attributes(self, context)
127                        original_model_post_init(self, context)

We need to both initialize private attributes and call the user-defined model_post_init method.

class TaskOutputRating(kiln_ai.datamodel.basemodel.KilnBaseModel):
109class TaskOutputRating(KilnBaseModel):
110    """
111    A rating for a task output, including an overall rating and ratings for each requirement.
112
113    Supports:
114    - five_star: 1-5 star ratings
115    - pass_fail: boolean pass/fail (1.0 = pass, 0.0 = fail)
116    - pass_fail_critical: tri-state (1.0 = pass, 0.0 = fail, -1.0 = critical fail)
117    """
118
119    type: TaskOutputRatingType = Field(default=TaskOutputRatingType.five_star)
120    value: float | None = Field(
121        description="The rating value. Interpretation depends on rating type:\n- five_star: 1-5 stars\n- pass_fail: 1.0 (pass) or 0.0 (fail)\n- pass_fail_critical: 1.0 (pass), 0.0 (fail), or -1.0 (critical fail)",
122        default=None,
123    )
124    requirement_ratings: Dict[ID_TYPE, RequirementRating] = Field(
125        default={},
126        description="The ratings of the requirements of the task.",
127    )
128
129    # Previously we stored rating values as a dict of floats, but now we store them as RequirementRating objects.
130    @model_validator(mode="before")
131    def upgrade_old_format(cls, data: dict) -> dict:
132        if not isinstance(data, dict):
133            return data
134
135        # Check if we have the old format (dict of floats)
136        req_ratings = data.get("requirement_ratings", {})
137        if req_ratings and all(
138            isinstance(v, (int, float)) for v in req_ratings.values()
139        ):
140            # Convert each float to a RequirementRating object
141            # all ratings are five star at the point we used this format
142            data["requirement_ratings"] = {
143                k: {"value": v, "type": TaskOutputRatingType.five_star}
144                for k, v in req_ratings.items()
145            }
146
147        return data
148
149    # Used to select high quality outputs for example selection (MultiShotPromptBuilder, etc)
150    def is_high_quality(self) -> bool:
151        if self.value is None:
152            return False
153
154        if self.type == TaskOutputRatingType.five_star:
155            return self.value >= 4
156        elif self.type == TaskOutputRatingType.pass_fail:
157            return self.value == 1.0
158        elif self.type == TaskOutputRatingType.pass_fail_critical:
159            return self.value == 1.0
160        return False
161
162    @model_validator(mode="after")
163    def validate_rating(self) -> Self:
164        if self.type not in TaskOutputRatingType:
165            raise ValueError(f"Invalid rating type: {self.type}")
166
167        # Overall rating is optional
168        if self.value is not None:
169            self._validate_rating(self.type, self.value, "overall rating")
170
171        for req_id, req_rating in self.requirement_ratings.items():
172            self._validate_rating(
173                req_rating.type,
174                req_rating.value,
175                f"requirement rating for req ID: {req_id}",
176            )
177
178        return self
179
180    def _validate_rating(
181        self, type: TaskOutputRatingType, rating: float | None, rating_name: str
182    ) -> None:
183        if type == TaskOutputRatingType.five_star:
184            self._validate_five_star(rating, rating_name)
185        elif type == TaskOutputRatingType.pass_fail:
186            self._validate_pass_fail(rating, rating_name)
187        elif type == TaskOutputRatingType.pass_fail_critical:
188            self._validate_pass_fail_critical(rating, rating_name)
189
190    def _validate_five_star(self, rating: float | None, rating_name: str) -> None:
191        if rating is None or not isinstance(rating, float) or not rating.is_integer():
192            raise ValueError(
193                f"{rating_name.capitalize()} of type five_star must be an integer value (1-5)"
194            )
195        if rating < 1 or rating > 5:
196            raise ValueError(
197                f"{rating_name.capitalize()} of type five_star must be between 1 and 5 stars"
198            )
199
200    def _validate_pass_fail(self, rating: float | None, rating_name: str) -> None:
201        if rating is None or not isinstance(rating, float) or not rating.is_integer():
202            raise ValueError(
203                f"{rating_name.capitalize()} of type pass_fail must be an integer value (0 or 1)"
204            )
205        if rating not in [0, 1]:
206            raise ValueError(
207                f"{rating_name.capitalize()} of type pass_fail must be 0 (fail) or 1 (pass)"
208            )
209
210    def _validate_pass_fail_critical(
211        self, rating: float | None, rating_name: str
212    ) -> None:
213        if rating is None or not isinstance(rating, float) or not rating.is_integer():
214            raise ValueError(
215                f"{rating_name.capitalize()} of type pass_fail_critical must be an integer value (-1, 0, or 1)"
216            )
217        if rating not in [-1, 0, 1]:
218            raise ValueError(
219                f"{rating_name.capitalize()} of type pass_fail_critical must be -1 (critical fail), 0 (fail), or 1 (pass)"
220            )

A rating for a task output, including an overall rating and ratings for each requirement.

Supports:

  • five_star: 1-5 star ratings
  • pass_fail: boolean pass/fail (1.0 = pass, 0.0 = fail)
  • pass_fail_critical: tri-state (1.0 = pass, 0.0 = fail, -1.0 = critical fail)
value: float | None
requirement_ratings: Dict[Optional[str], RequirementRating]
@model_validator(mode='before')
def upgrade_old_format(cls, data: dict) -> dict:
130    @model_validator(mode="before")
131    def upgrade_old_format(cls, data: dict) -> dict:
132        if not isinstance(data, dict):
133            return data
134
135        # Check if we have the old format (dict of floats)
136        req_ratings = data.get("requirement_ratings", {})
137        if req_ratings and all(
138            isinstance(v, (int, float)) for v in req_ratings.values()
139        ):
140            # Convert each float to a RequirementRating object
141            # all ratings are five star at the point we used this format
142            data["requirement_ratings"] = {
143                k: {"value": v, "type": TaskOutputRatingType.five_star}
144                for k, v in req_ratings.items()
145            }
146
147        return data
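
A minimal sketch of the upgrade path (the requirement ID "req_123" is hypothetical; other fields take their defaults). The before-validator coerces a legacy dict of plain floats into RequirementRating objects:

    from kiln_ai.datamodel import TaskOutputRating, TaskOutputRatingType

    legacy = {
        "value": 5.0,
        # old format: requirement_ratings was a plain dict of floats
        "requirement_ratings": {"req_123": 4.0},
    }
    rating = TaskOutputRating.model_validate(legacy)
    req = rating.requirement_ratings["req_123"]
    assert req.type == TaskOutputRatingType.five_star and req.value == 4.0
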
def is_high_quality(self) -> bool:
150    def is_high_quality(self) -> bool:
151        if self.value is None:
152            return False
153
154        if self.type == TaskOutputRatingType.five_star:
155            return self.value >= 4
156        elif self.type == TaskOutputRatingType.pass_fail:
157            return self.value == 1.0
158        elif self.type == TaskOutputRatingType.pass_fail_critical:
159            return self.value == 1.0
160        return False
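
Illustrative thresholds, assuming all other fields take their defaults:

    from kiln_ai.datamodel import TaskOutputRating, TaskOutputRatingType

    assert TaskOutputRating(value=5.0).is_high_quality()      # five_star: 4-5 is high quality
    assert not TaskOutputRating(value=3.0).is_high_quality()  # 1-3 is not
    assert TaskOutputRating(type=TaskOutputRatingType.pass_fail, value=1.0).is_high_quality()
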
@model_validator(mode='after')
def validate_rating(self) -> Self:
162    @model_validator(mode="after")
163    def validate_rating(self) -> Self:
164        if self.type not in TaskOutputRatingType:
165            raise ValueError(f"Invalid rating type: {self.type}")
166
167        # Overall rating is optional
168        if self.value is not None:
169            self._validate_rating(self.type, self.value, "overall rating")
170
171        for req_id, req_rating in self.requirement_ratings.items():
172            self._validate_rating(
173                req_rating.type,
174                req_rating.value,
175                f"requirement rating for req ID: {req_id}",
176            )
177
178        return self
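
Because pydantic's ValidationError subclasses ValueError, invalid ratings can be caught as in this sketch:

    from kiln_ai.datamodel import TaskOutputRating

    try:
        TaskOutputRating(value=4.5)  # five_star values must be whole numbers
    except ValueError as error:
        print(error)  # "... must be an integer value (1-5)"
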

class Priority(enum.IntEnum):
81class Priority(IntEnum):
82    """Defines priority levels for tasks and requirements, where P0 is highest priority."""
83
84    p0 = 0
85    p1 = 1
86    p2 = 2
87    p3 = 3

Defines priority levels for tasks and requirements, where P0 is highest priority.

p0 = <Priority.p0: 0>
p1 = <Priority.p1: 1>
p2 = <Priority.p2: 2>
p3 = <Priority.p3: 3>
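
Since Priority is an IntEnum, priorities compare and sort numerically, as in this sketch:

    from kiln_ai.datamodel import Priority

    assert Priority.p0 < Priority.p1                  # lower value means higher priority
    assert min(Priority.p2, Priority.p0) == Priority.p0
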
class DataSource(pydantic.main.BaseModel):
357class DataSource(BaseModel):
358    """
359    Represents the origin of data, either human or synthetic, with associated properties.
360
361    Properties vary based on the source type - for synthetic sources this includes
362    model information, for human sources this includes creator information.
363    """
364
365    type: DataSourceType
366    properties: Dict[str, str | int | float] = Field(
367        default={},
368        description="Properties describing the data source. For synthetic things like model. For human, the human's name.",
369    )
370
371    _data_source_properties = [
372        DataSourceProperty(
373            name="created_by",
374            type=str,
375            required_for=[DataSourceType.human],
376            not_allowed_for=[DataSourceType.synthetic],
377        ),
378        DataSourceProperty(
379            name="model_name",
380            type=str,
381            required_for=[DataSourceType.synthetic],
382            not_allowed_for=[DataSourceType.human],
383        ),
384        DataSourceProperty(
385            name="model_provider",
386            type=str,
387            required_for=[DataSourceType.synthetic],
388            not_allowed_for=[DataSourceType.human],
389        ),
390        DataSourceProperty(
391            name="adapter_name",
392            type=str,
393            required_for=[DataSourceType.synthetic],
394            not_allowed_for=[DataSourceType.human],
395        ),
396        DataSourceProperty(
397            name="prompt_builder_name",
398            type=str,
399            not_allowed_for=[DataSourceType.human],
400        ),
401    ]
402
403    @model_validator(mode="after")
404    def validate_type(self) -> "DataSource":
405        if self.type not in DataSourceType:
406            raise ValueError(f"Invalid data source type: {self.type}")
407        return self
408
409    @model_validator(mode="after")
410    def validate_properties(self) -> "DataSource":
411        for prop in self._data_source_properties:
412            # Check the property type is correct
413            if prop.name in self.properties:
414                if not isinstance(self.properties[prop.name], prop.type):
415                    raise ValueError(
416                        f"'{prop.name}' must be of type {prop.type.__name__} for {self.type} data source"
417                    )
418            # Check the property is required for the data source type
419            if self.type in prop.required_for:
420                if prop.name not in self.properties:
421                    raise ValueError(
422                        f"'{prop.name}' is required for {self.type} data source"
423                    )
424            # Check the property is not allowed for the data source type
425            elif self.type in prop.not_allowed_for and prop.name in self.properties:
426                raise ValueError(
427                    f"'{prop.name}' is not allowed for {self.type} data source"
428                )
429        return self
430
431    @model_validator(mode="after")
432    def validate_no_empty_properties(self) -> Self:
433        for prop, value in self.properties.items():
434            if isinstance(value, str) and value == "":
435                raise ValueError(
436                    f"Property '{prop}' must be a non-empty string for {self.type} data source"
437                )
438        return self

Represents the origin of data, either human or synthetic, with associated properties.

Properties vary based on the source type - for synthetic sources this includes model information, for human sources this includes creator information.

type: DataSourceType
properties: Dict[str, str | int | float]
@model_validator(mode='after')
def validate_type(self) -> DataSource:
403    @model_validator(mode="after")
404    def validate_type(self) -> "DataSource":
405        if self.type not in DataSourceType:
406            raise ValueError(f"Invalid data source type: {self.type}")
407        return self
@model_validator(mode='after')
def validate_properties(self) -> DataSource:
409    @model_validator(mode="after")
410    def validate_properties(self) -> "DataSource":
411        for prop in self._data_source_properties:
412            # Check the property type is correct
413            if prop.name in self.properties:
414                if not isinstance(self.properties[prop.name], prop.type):
415                    raise ValueError(
416                        f"'{prop.name}' must be of type {prop.type.__name__} for {self.type} data source"
417                    )
418            # Check the property is required for the data source type
419            if self.type in prop.required_for:
420                if prop.name not in self.properties:
421                    raise ValueError(
422                        f"'{prop.name}' is required for {self.type} data source"
423                    )
424            # Check the property is not allowed for the data source type
425            elif self.type in prop.not_allowed_for and prop.name in self.properties:
426                raise ValueError(
427                    f"'{prop.name}' is not allowed for {self.type} data source"
428                )
429        return self
@model_validator(mode='after')
def validate_no_empty_properties(self) -> Self:
431    @model_validator(mode="after")
432    def validate_no_empty_properties(self) -> Self:
433        for prop, value in self.properties.items():
434            if isinstance(value, str) and value == "":
435                raise ValueError(
436                    f"Property '{prop}' must be a non-empty string for {self.type} data source"
437                )
438        return self
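
A usage sketch (all property values are hypothetical): each source type requires its own properties, and supplying a property from the other type raises a ValueError.

    from kiln_ai.datamodel import DataSource, DataSourceType

    human = DataSource(
        type=DataSourceType.human,
        properties={"created_by": "jane@example.com"},
    )
    synthetic = DataSource(
        type=DataSourceType.synthetic,
        properties={
            "model_name": "gpt-4o",
            "model_provider": "openai",
            "adapter_name": "langchain_adapter",
        },
    )
    # e.g. adding "created_by" to the synthetic source above raises a ValueError
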

class DataSourceType(builtins.str, enum.Enum):
331class DataSourceType(str, Enum):
332    """
333    The source type of a piece of data.
334
335    Human: a human created the data
336    Synthetic: a model created the data
337    """
338
339    human = "human"
340    synthetic = "synthetic"

The source type of a piece of data.

Human: a human created the data
Synthetic: a model created the data

human = <DataSourceType.human: 'human'>
synthetic = <DataSourceType.synthetic: 'synthetic'>
class DataSourceProperty(pydantic.main.BaseModel):
343class DataSourceProperty(BaseModel):
344    """
345    Defines a property that can be associated with a data source.
346
347    Includes validation rules for when properties are required or not allowed
348    based on the data source type.
349    """
350
351    name: str
352    type: Type[Union[str, int, float]]
353    required_for: List[DataSourceType] = []
354    not_allowed_for: List[DataSourceType] = []

Defines a property that can be associated with a data source.

Includes validation rules for when properties are required or not allowed based on the data source type.

name: str
type: Type[Union[str, int, float]]
required_for: List[DataSourceType]
not_allowed_for: List[DataSourceType]
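
A sketch of a hypothetical rule (a "temperature" property is not part of the built-in set), showing how a property can be restricted by source type:

    from kiln_ai.datamodel import DataSourceProperty, DataSourceType

    temperature = DataSourceProperty(
        name="temperature",
        type=float,
        not_allowed_for=[DataSourceType.human],
    )
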
class Finetune(kiln_ai.datamodel.basemodel.KilnParentedModel):
278class Finetune(KilnParentedModel):
279    name: str = NAME_FIELD
280    description: str | None = Field(
281        default=None,
282        description="A description of the fine-tune for you and your team. Not used in training.",
283    )
284    provider: str = Field(
285        description="The provider to use for the fine-tune (e.g. 'openai')."
286    )
287    base_model_id: str = Field(
288        description="The id of the base model to use for the fine-tune. This string relates to the provider's IDs for their own models, not Kiln IDs."
289    )
290    provider_id: str | None = Field(
291        default=None,
292        description="The ID of the fine-tune job on the provider's side. May not be the same as the fine_tune_model_id.",
293    )
294    fine_tune_model_id: str | None = Field(
295        default=None,
296        description="The ID of the fine-tuned model on the provider's side. May not be the same as the provider_id.",
297    )
298    dataset_split_id: str = Field(
299        description="The ID of the dataset split to use for this fine-tune.",
300    )
301    train_split_name: str = Field(
302        default="train",
303        description="The name of the training split to use for this fine-tune.",
304    )
305    validation_split_name: str | None = Field(
306        default=None,
307        description="The name of the validation split to use for this fine-tune. Optional.",
308    )
309    parameters: dict[str, str | int | float | bool] = Field(
310        default={},
311        description="The parameters to use for this fine-tune. These are provider-specific.",
312    )
313    system_message: str = Field(
314        description="The system message to use for this fine-tune.",
315    )
316    latest_status: FineTuneStatusType = Field(
317        default=FineTuneStatusType.unknown,
318        description="The latest known status of this fine-tune. Not updated in real time.",
319    )
320    properties: Dict[str, str | int | float] = Field(
321        default={},
322        description="Properties of the fine-tune. Different providers may use different properties.",
323    )
324
325    def parent_task(self) -> Task | None:
326        if not isinstance(self.parent, Task):
327            return None
328        return self.parent

Base model for Kiln models that have a parent-child relationship. This base class is for child models.

This class provides functionality for managing hierarchical relationships between models, including parent reference handling and file system organization.

Attributes:
    parent (KilnBaseModel): Reference to the parent model instance. Not persisted, just in memory.

name: str
description: str | None
provider: str
base_model_id: str
provider_id: str | None
fine_tune_model_id: str | None
dataset_split_id: str
train_split_name: str
validation_split_name: str | None
parameters: dict[str, str | int | float | bool]
system_message: str
latest_status: FineTuneStatusType
properties: Dict[str, str | int | float]
def parent_task(self) -> Task | None:
325    def parent_task(self) -> Task | None:
326        if not isinstance(self.parent, Task):
327            return None
328        return self.parent
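
A construction sketch (all IDs and names are hypothetical; `task` is assumed to be an existing Task instance):

    from kiln_ai.datamodel import Finetune, FineTuneStatusType

    finetune = Finetune(
        parent=task,
        name="summarizer-v1",
        provider="openai",
        base_model_id="gpt-4o-mini-2024-07-18",  # the provider's model ID, not a Kiln ID
        dataset_split_id="12345",
        system_message="Summarize the user's text.",
    )
    assert finetune.latest_status == FineTuneStatusType.unknown  # default until polled
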

class FineTuneStatusType(builtins.str, enum.Enum):
266class FineTuneStatusType(str, Enum):
267    """
268    The status type of a fine-tune (running, completed, failed, etc).
269    """
270
271    unknown = "unknown"  # server error
272    pending = "pending"
273    running = "running"
274    completed = "completed"
275    failed = "failed"

The status type of a fine-tune (running, completed, failed, etc).

unknown = <FineTuneStatusType.unknown: 'unknown'>
pending = <FineTuneStatusType.pending: 'pending'>
running = <FineTuneStatusType.running: 'running'>
completed = <FineTuneStatusType.completed: 'completed'>
failed = <FineTuneStatusType.failed: 'failed'>
class TaskOutputRatingType(builtins.str, enum.Enum):
91class TaskOutputRatingType(str, Enum):
92    """Defines the types of rating systems available for task outputs."""
93
94    five_star = "five_star"
95    pass_fail = "pass_fail"
96    pass_fail_critical = "pass_fail_critical"
97    custom = "custom"

Defines the types of rating systems available for task outputs.

five_star = <TaskOutputRatingType.five_star: 'five_star'>
pass_fail = <TaskOutputRatingType.pass_fail: 'pass_fail'>
pass_fail_critical = <TaskOutputRatingType.pass_fail_critical: 'pass_fail_critical'>
custom = <TaskOutputRatingType.custom: 'custom'>
class TaskRequirement(pydantic.main.BaseModel):
699class TaskRequirement(BaseModel):
700    """
701    Defines a specific requirement that should be met by task outputs.
702
703    Includes an identifier, name, description, instruction for meeting the requirement,
704    priority level, and rating type (five_star, pass_fail, pass_fail_critical, custom).
705    """
706
707    id: ID_TYPE = ID_FIELD
708    name: str = SHORT_NAME_FIELD
709    description: str | None = Field(default=None)
710    instruction: str = Field(min_length=1)
711    priority: Priority = Field(default=Priority.p2)
712    type: TaskOutputRatingType = Field(default=TaskOutputRatingType.five_star)

Defines a specific requirement that should be met by task outputs.

Includes an identifier, name, description, instruction for meeting the requirement, priority level, and rating type (five_star, pass_fail, pass_fail_critical, custom).

id: Optional[str]
name: str
description: str | None
instruction: str
priority: Priority
type: TaskOutputRatingType
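
A construction sketch with hypothetical values; unset fields keep their defaults (priority p2, five_star rating):

    from kiln_ai.datamodel import Priority, TaskOutputRatingType, TaskRequirement

    requirement = TaskRequirement(
        name="Concise",
        instruction="Responses must be under 100 words.",
        priority=Priority.p1,
        type=TaskOutputRatingType.pass_fail,
    )
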
class TaskDeterminism(builtins.str, enum.Enum):
715class TaskDeterminism(str, Enum):
716    """
717    Defines how strictly task outputs should match expected results.
718
719    - deterministic: Requires exact matches
720    - semantic_match: Allows different wording with same meaning
721    - flexible: Allows variation in both wording and meaning within requirements
722    """
723
724    deterministic = "deterministic"  # Expect exact match
725    semantic_match = "semantic_match"  # Expect same meaning, but flexible on expression of the meaning
726    flexible = "flexible"  # Flexible on semantic output. Eval should be custom based on parsing requirements.

Defines how strictly task outputs should match expected results.

  • deterministic: Requires exact matches
  • semantic_match: Allows different wording with same meaning
  • flexible: Allows variation in both wording and meaning within requirements
deterministic = <TaskDeterminism.deterministic: 'deterministic'>
semantic_match = <TaskDeterminism.semantic_match: 'semantic_match'>
flexible = <TaskDeterminism.flexible: 'flexible'>
class DatasetSplitDefinition(pydantic.main.BaseModel):
559class DatasetSplitDefinition(BaseModel):
560    """
561    A definition of a split in a dataset.
562
563    Example: name="train", description="The training set", percentage=0.8 (80% of the dataset)
564    """
565
566    name: str = NAME_FIELD
567    description: str | None = Field(
568        default=None,
569        description="A description of the dataset for you and your team. Not used in training.",
570    )
571    percentage: float = Field(
572        ge=0.0,
573        le=1.0,
574        description="The percentage of the dataset that this split represents (between 0 and 1).",
575    )

A definition of a split in a dataset.

Example: name="train", description="The training set", percentage=0.8 (80% of the dataset)

name: str
description: str | None
percentage: float
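
A sketch of an 80/20 split definition; the percentages across a dataset's splits must sum to 1.0:

    from kiln_ai.datamodel import DatasetSplitDefinition

    splits = [
        DatasetSplitDefinition(name="train", percentage=0.8),
        DatasetSplitDefinition(name="test", percentage=0.2),
    ]
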
class DatasetSplit(kiln_ai.datamodel.basemodel.KilnParentedModel):
592class DatasetSplit(KilnParentedModel):
593    """
594    A collection of task runs, with optional splits (train, test, validation).
595
596    Used to freeze a dataset into train/test/validation splits for repeatable fine-tuning or other tasks.
597
598    Maintains a list of IDs for each split, to avoid data duplication.
599    """
600
601    name: str = NAME_FIELD
602    description: str | None = Field(
603        default=None,
604        description="A description of the dataset for you and your team. Not used in training.",
605    )
606    splits: list[DatasetSplitDefinition] = Field(
607        default_factory=list,
608        description="The splits in the dataset.",
609    )
610    split_contents: dict[str, list[str]] = Field(
611        description="The contents of each split in the dataset. The key is the split name, and the value is a list of task run IDs.",
612    )
613
614    @model_validator(mode="after")
615    def validate_split_percentages(self) -> "DatasetSplit":
616        total = sum(split.percentage for split in self.splits)
617        if not math.isclose(total, 1.0, rel_tol=1e-9):
618            raise ValueError(f"The sum of split percentages must be 1.0 (got {total})")
619        return self
620
621    @classmethod
622    def from_task(
623        cls,
624        name: str,
625        task: "Task",
626        splits: list[DatasetSplitDefinition],
627        filter: DatasetFilter = AllDatasetFilter,
628        description: str | None = None,
629    ):
630        """
631        Build a dataset split from a task.
632        """
633        split_contents = cls.build_split_contents(task, splits, filter)
634        return cls(
635            parent=task,
636            name=name,
637            description=description,
638            splits=splits,
639            split_contents=split_contents,
640        )
641
642    @classmethod
643    def build_split_contents(
644        cls,
645        task: "Task",
646        splits: list[DatasetSplitDefinition],
647        filter: DatasetFilter,
648    ) -> dict[str, list[str]]:
649        valid_ids = []
650        for task_run in task.runs():
651            if filter(task_run):
652                valid_ids.append(task_run.id)
653
654        # Shuffle and split by split percentage
655        random.shuffle(valid_ids)
656        split_contents = {}
657        start_idx = 0
658        remaining_items = len(valid_ids)
659
660        # Handle all splits except the last one
661        for split in splits[:-1]:
662            split_size = round(len(valid_ids) * split.percentage)
663            split_contents[split.name] = valid_ids[start_idx : start_idx + split_size]
664            start_idx += split_size
665            remaining_items -= split_size
666
667        # Last split gets all remaining items (for rounding)
668        if splits:
669            split_contents[splits[-1].name] = valid_ids[start_idx:]
670
671        return split_contents
672
673    def parent_task(self) -> "Task | None":
674        # inline import to avoid circular import
675        from kiln_ai.datamodel import Task
676
677        if not isinstance(self.parent, Task):
678            return None
679        return self.parent
680
681    def missing_count(self) -> int:
682        """
683        Returns:
684            int: the number of task runs that have an ID persisted in this dataset split, but no longer exist in the dataset
685        """
686        parent = self.parent_task()
687        if parent is None:
688            raise ValueError("DatasetSplit has no parent task")
689
690        runs = parent.runs()
691        all_ids = set(run.id for run in runs)
692        all_ids_in_splits = set()
693        for ids in self.split_contents.values():
694            all_ids_in_splits.update(ids)
695        missing = all_ids_in_splits - all_ids
696        return len(missing)

A collection of task runs, with optional splits (train, test, validation).

Used to freeze a dataset into train/test/validation splits for repeatable fine-tuning or other tasks.

Maintains a list of IDs for each split, to avoid data duplication.

name: str
description: str | None
splits: list[DatasetSplitDefinition]
split_contents: dict[str, list[str]]
@model_validator(mode='after')
def validate_split_percentages(self) -> DatasetSplit:
614    @model_validator(mode="after")
615    def validate_split_percentages(self) -> "DatasetSplit":
616        total = sum(split.percentage for split in self.splits)
617        if not math.isclose(total, 1.0, rel_tol=1e-9):
618            raise ValueError(f"The sum of split percentages must be 1.0 (got {total})")
619        return self
@classmethod
def from_task( cls, name: str, task: Task, splits: list[DatasetSplitDefinition], filter: Callable[[TaskRun], bool] = AllDatasetFilter, description: str | None = None):
621    @classmethod
622    def from_task(
623        cls,
624        name: str,
625        task: "Task",
626        splits: list[DatasetSplitDefinition],
627        filter: DatasetFilter = AllDatasetFilter,
628        description: str | None = None,
629    ):
630        """
631        Build a dataset split from a task.
632        """
633        split_contents = cls.build_split_contents(task, splits, filter)
634        return cls(
635            parent=task,
636            name=name,
637            description=description,
638            splits=splits,
639            split_contents=split_contents,
640        )

Build a dataset split from a task.
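
A usage sketch, assuming `task` is an existing Task with saved runs:

    from kiln_ai.datamodel import DatasetSplit, DatasetSplitDefinition

    split = DatasetSplit.from_task(
        name="v1",
        task=task,
        splits=[
            DatasetSplitDefinition(name="train", percentage=0.8),
            DatasetSplitDefinition(name="test", percentage=0.2),
        ],
    )
    train_ids = split.split_contents["train"]  # shuffled task run IDs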

@classmethod
def build_split_contents( cls, task: Task, splits: list[DatasetSplitDefinition], filter: Callable[[TaskRun], bool]) -> dict[str, list[str]]:
642    @classmethod
643    def build_split_contents(
644        cls,
645        task: "Task",
646        splits: list[DatasetSplitDefinition],
647        filter: DatasetFilter,
648    ) -> dict[str, list[str]]:
649        valid_ids = []
650        for task_run in task.runs():
651            if filter(task_run):
652                valid_ids.append(task_run.id)
653
654        # Shuffle and split by split percentage
655        random.shuffle(valid_ids)
656        split_contents = {}
657        start_idx = 0
658        remaining_items = len(valid_ids)
659
660        # Handle all splits except the last one
661        for split in splits[:-1]:
662            split_size = round(len(valid_ids) * split.percentage)
663            split_contents[split.name] = valid_ids[start_idx : start_idx + split_size]
664            start_idx += split_size
665            remaining_items -= split_size
666
667        # Last split gets all remaining items (for rounding)
668        if splits:
669            split_contents[splits[-1].name] = valid_ids[start_idx:]
670
671        return split_contents
def parent_task(self) -> Task | None:
673    def parent_task(self) -> "Task | None":
674        # inline import to avoid circular import
675        from kiln_ai.datamodel import Task
676
677        if not isinstance(self.parent, Task):
678            return None
679        return self.parent
def missing_count(self) -> int:
681    def missing_count(self) -> int:
682        """
683        Returns:
684            int: the number of task runs that have an ID persisted in this dataset split, but no longer exist in the dataset
685        """
686        parent = self.parent_task()
687        if parent is None:
688            raise ValueError("DatasetSplit has no parent task")
689
690        runs = parent.runs()
691        all_ids = set(run.id for run in runs)
692        all_ids_in_splits = set()
693        for ids in self.split_contents.values():
694            all_ids_in_splits.update(ids)
695        missing = all_ids_in_splits - all_ids
696        return len(missing)

Returns:
    int: the number of task runs that have an ID persisted in this dataset split, but no longer exist in the dataset


class RequirementRating(pydantic.main.BaseModel):
100class RequirementRating(BaseModel):
101    """Rating for a specific requirement within a task output."""
102
103    value: float = Field(
104        description="The rating value. Interpretation depends on rating type"
105    )
106    type: TaskOutputRatingType = Field(description="The type of rating")

Rating for a specific requirement within a task output.

value: float
type: TaskOutputRatingType
def strict_mode() -> bool:
72def strict_mode() -> bool:
73    return _strict_mode
def set_strict_mode(value: bool) -> None:
76def set_strict_mode(value: bool) -> None:
77    global _strict_mode
78    _strict_mode = value
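
A toggling sketch; strict mode is off by default for library users:

    import kiln_ai.datamodel as datamodel

    datamodel.set_strict_mode(True)   # opt in to the Kiln App's extra validations
    assert datamodel.strict_mode()
    datamodel.set_strict_mode(False)  # back to the library default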