kiln_ai.datamodel
See our docs for details about our datamodel: https://kiln-ai.github.io/Kiln/kiln_core_docs/kiln_ai.html
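For orientation, here is a minimal sketch of wiring the core models together in memory, following the validations in the source below. The names and model identifiers are illustrative, and persistence (handled by the base model layer) is not shown.

from kiln_ai.datamodel import (
    DataSource,
    DataSourceType,
    Project,
    Task,
    TaskOutput,
    TaskRun,
    set_strict_mode,
)

set_strict_mode(True)  # opt in to the Kiln App's stricter validations

project = Project(name="demo_project")
task = Task(
    parent=project,
    name="summarize",
    instruction="Summarize the input text in one sentence.",
)

run = TaskRun(
    parent=task,
    input="A long article about kilns...",
    # In strict mode, input and output sources are required for new data.
    input_source=DataSource(
        type=DataSourceType.human,
        properties={"created_by": "alice"},
    ),
    output=TaskOutput(
        output="Kilns are ovens for firing ceramics.",
        source=DataSource(
            type=DataSourceType.synthetic,
            properties={
                "model_name": "gpt-4o",
                "model_provider": "openai",
                "adapter_name": "kiln_adapter",
            },
        ),
    ),
)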
1""" 2See our docs for details about our datamodel: https://kiln-ai.github.io/Kiln/kiln_core_docs/kiln_ai.html 3""" 4 5from __future__ import annotations 6 7import json 8import math 9import random 10from enum import Enum, IntEnum 11from typing import TYPE_CHECKING, Callable, Dict, List, Type, Union 12 13import jsonschema 14import jsonschema.exceptions 15from pydantic import ( 16 BaseModel, 17 Field, 18 ValidationInfo, 19 model_validator, 20) 21from typing_extensions import Self 22 23from kiln_ai.datamodel.json_schema import JsonObjectSchema, schema_from_json_str 24 25from .basemodel import ( 26 ID_FIELD, 27 ID_TYPE, 28 NAME_FIELD, 29 SHORT_NAME_FIELD, 30 KilnBaseModel, 31 KilnParentedModel, 32 KilnParentModel, 33) 34from .json_schema import validate_schema 35 36if TYPE_CHECKING: 37 from . import Task 38 39 40__all__ = [ 41 "basemodel", 42 "json_schema", 43 "Task", 44 "Project", 45 "TaskRun", 46 "TaskOutput", 47 "TaskOutputRating", 48 "Priority", 49 "DataSource", 50 "DataSourceType", 51 "DataSourceProperty", 52 "Finetune", 53 "FineTuneStatusType", 54 "TaskOutputRatingType", 55 "TaskRequirement", 56 "TaskDeterminism", 57 "DatasetSplitDefinition", 58 "DatasetSplit", 59 "RequirementRating", 60 "TaskRequirement", 61 "strict_mode", 62 "set_strict_mode", 63] 64 65 66# We want to be hard on ourselves for data completeness generated by the Kiln App, but don't want to make it hard for users to use the datamodel/library. 67# Strict mode enables extra validations that we want to enforce in Kiln App (and any other client that wants best practices), but not in the library (unless they opt in) 68_strict_mode: bool = False 69 70 71def strict_mode() -> bool: 72 return _strict_mode 73 74 75def set_strict_mode(value: bool) -> None: 76 global _strict_mode 77 _strict_mode = value 78 79 80class Priority(IntEnum): 81 """Defines priority levels for tasks and requirements, where P0 is highest priority.""" 82 83 p0 = 0 84 p1 = 1 85 p2 = 2 86 p3 = 3 87 88 89# Only one rating type for now, but this allows for extensibility if we want to add more in the future 90class TaskOutputRatingType(str, Enum): 91 """Defines the types of rating systems available for task outputs.""" 92 93 five_star = "five_star" 94 pass_fail = "pass_fail" 95 pass_fail_critical = "pass_fail_critical" 96 custom = "custom" 97 98 99class RequirementRating(BaseModel): 100 """Rating for a specific requirement within a task output.""" 101 102 value: float = Field( 103 description="The rating value. Interpretation depends on rating type" 104 ) 105 type: TaskOutputRatingType = Field(description="The type of rating") 106 107 108class TaskOutputRating(KilnBaseModel): 109 """ 110 A rating for a task output, including an overall rating and ratings for each requirement. 111 112 Supports: 113 - five_star: 1-5 star ratings 114 - pass_fail: boolean pass/fail (1.0 = pass, 0.0 = fail) 115 - pass_fail_critical: tri-state (1.0 = pass, 0.0 = fail, -1.0 = critical fail) 116 """ 117 118 type: TaskOutputRatingType = Field(default=TaskOutputRatingType.five_star) 119 value: float | None = Field( 120 description="The rating value. 
Interpretation depends on rating type:\n- five_star: 1-5 stars\n- pass_fail: 1.0 (pass) or 0.0 (fail)\n- pass_fail_critical: 1.0 (pass), 0.0 (fail), or -1.0 (critical fail)", 121 default=None, 122 ) 123 requirement_ratings: Dict[ID_TYPE, RequirementRating] = Field( 124 default={}, 125 description="The ratings of the requirements of the task.", 126 ) 127 128 # Previously we stored rating values as a dict of floats, but now we store them as RequirementRating objects. 129 @model_validator(mode="before") 130 def upgrade_old_format(cls, data: dict) -> dict: 131 if not isinstance(data, dict): 132 return data 133 134 # Check if we have the old format (dict of floats) 135 req_ratings = data.get("requirement_ratings", {}) 136 if req_ratings and all( 137 isinstance(v, (int, float)) for v in req_ratings.values() 138 ): 139 # Convert each float to a RequirementRating object 140 # all ratings are five star at the point we used this format 141 data["requirement_ratings"] = { 142 k: {"value": v, "type": TaskOutputRatingType.five_star} 143 for k, v in req_ratings.items() 144 } 145 146 return data 147 148 # Used to select high quality outputs for example selection (MultiShotPromptBuilder, etc) 149 def is_high_quality(self) -> bool: 150 if self.value is None: 151 return False 152 153 if self.type == TaskOutputRatingType.five_star: 154 return self.value >= 4 155 elif self.type == TaskOutputRatingType.pass_fail: 156 return self.value == 1.0 157 elif self.type == TaskOutputRatingType.pass_fail_critical: 158 return self.value == 1.0 159 return False 160 161 @model_validator(mode="after") 162 def validate_rating(self) -> Self: 163 if self.type not in TaskOutputRatingType: 164 raise ValueError(f"Invalid rating type: {self.type}") 165 166 # Overall rating is optional 167 if self.value is not None: 168 self._validate_rating(self.type, self.value, "overall rating") 169 170 for req_id, req_rating in self.requirement_ratings.items(): 171 self._validate_rating( 172 req_rating.type, 173 req_rating.value, 174 f"requirement rating for req ID: {req_id}", 175 ) 176 177 return self 178 179 def _validate_rating( 180 self, type: TaskOutputRatingType, rating: float | None, rating_name: str 181 ) -> None: 182 if type == TaskOutputRatingType.five_star: 183 self._validate_five_star(rating, rating_name) 184 elif type == TaskOutputRatingType.pass_fail: 185 self._validate_pass_fail(rating, rating_name) 186 elif type == TaskOutputRatingType.pass_fail_critical: 187 self._validate_pass_fail_critical(rating, rating_name) 188 189 def _validate_five_star(self, rating: float | None, rating_name: str) -> None: 190 if rating is None or not isinstance(rating, float) or not rating.is_integer(): 191 raise ValueError( 192 f"{rating_name.capitalize()} of type five_star must be an integer value (1-5)" 193 ) 194 if rating < 1 or rating > 5: 195 raise ValueError( 196 f"{rating_name.capitalize()} of type five_star must be between 1 and 5 stars" 197 ) 198 199 def _validate_pass_fail(self, rating: float | None, rating_name: str) -> None: 200 if rating is None or not isinstance(rating, float) or not rating.is_integer(): 201 raise ValueError( 202 f"{rating_name.capitalize()} of type pass_fail must be an integer value (0 or 1)" 203 ) 204 if rating not in [0, 1]: 205 raise ValueError( 206 f"{rating_name.capitalize()} of type pass_fail must be 0 (fail) or 1 (pass)" 207 ) 208 209 def _validate_pass_fail_critical( 210 self, rating: float | None, rating_name: str 211 ) -> None: 212 if rating is None or not isinstance(rating, float) or not rating.is_integer(): 
213 raise ValueError( 214 f"{rating_name.capitalize()} of type pass_fail_critical must be an integer value (-1, 0, or 1)" 215 ) 216 if rating not in [-1, 0, 1]: 217 raise ValueError( 218 f"{rating_name.capitalize()} of type pass_fail_critical must be -1 (critical fail), 0 (fail), or 1 (pass)" 219 ) 220 221 222class TaskOutput(KilnBaseModel): 223 """ 224 An output for a specific task run. 225 226 Contains the actual output content, its source (human or synthetic), 227 and optional rating information. 228 """ 229 230 output: str = Field( 231 description="The output of the task. JSON formatted for structured output, plaintext for unstructured output." 232 ) 233 source: DataSource | None = Field( 234 description="The source of the output: human or synthetic.", 235 default=None, 236 ) 237 rating: TaskOutputRating | None = Field( 238 default=None, description="The rating of the output" 239 ) 240 241 def validate_output_format(self, task: Task) -> Self: 242 # validate output 243 if task.output_json_schema is not None: 244 try: 245 validate_schema(json.loads(self.output), task.output_json_schema) 246 except json.JSONDecodeError: 247 raise ValueError("Output is not a valid JSON object") 248 except jsonschema.exceptions.ValidationError as e: 249 raise ValueError(f"Output does not match task output schema: {e}") 250 return self 251 252 @model_validator(mode="after") 253 def validate_output_source(self, info: ValidationInfo) -> Self: 254 # On strict mode and not loaded from file, we validate output_source is not None. 255 # We want to be able to load any data, even if it's not perfect. But we want to create perfect data when adding new data. 256 if not strict_mode(): 257 return self 258 if self.loaded_from_file(info): 259 return self 260 if self.source is None: 261 raise ValueError("Output source is required when strict mode is enabled") 262 return self 263 264 265class FineTuneStatusType(str, Enum): 266 """ 267 The status type of a fine-tune (running, completed, failed, etc). 268 """ 269 270 unknown = "unknown" # server error 271 pending = "pending" 272 running = "running" 273 completed = "completed" 274 failed = "failed" 275 276 277class Finetune(KilnParentedModel): 278 name: str = NAME_FIELD 279 description: str | None = Field( 280 default=None, 281 description="A description of the fine-tune for you and your team. Not used in training.", 282 ) 283 provider: str = Field( 284 description="The provider to use for the fine-tune (e.g. 'openai')." 285 ) 286 base_model_id: str = Field( 287 description="The id of the base model to use for the fine-tune. This string relates to the provider's IDs for their own models, not Kiln IDs." 288 ) 289 provider_id: str | None = Field( 290 default=None, 291 description="The ID of the fine-tune job on the provider's side. May not be the same as the fine_tune_model_id.", 292 ) 293 fine_tune_model_id: str | None = Field( 294 default=None, 295 description="The ID of the fine-tuned model on the provider's side. May not be the same as the provider_id.", 296 ) 297 dataset_split_id: str = Field( 298 description="The ID of the dataset split to use for this fine-tune.", 299 ) 300 train_split_name: str = Field( 301 default="train", 302 description="The name of the training split to use for this fine-tune.", 303 ) 304 validation_split_name: str | None = Field( 305 default=None, 306 description="The name of the validation split to use for this fine-tune. 
Optional.", 307 ) 308 parameters: dict[str, str | int | float | bool] = Field( 309 default={}, 310 description="The parameters to use for this fine-tune. These are provider-specific.", 311 ) 312 system_message: str = Field( 313 description="The system message to use for this fine-tune.", 314 ) 315 latest_status: FineTuneStatusType = Field( 316 default=FineTuneStatusType.unknown, 317 description="The latest known status of this fine-tune. Not updated in real time.", 318 ) 319 properties: Dict[str, str | int | float] = Field( 320 default={}, 321 description="Properties of the fine-tune. Different providers may use different properties.", 322 ) 323 324 def parent_task(self) -> Task | None: 325 if not isinstance(self.parent, Task): 326 return None 327 return self.parent 328 329 330class DataSourceType(str, Enum): 331 """ 332 The source type of a piece of data. 333 334 Human: a human created the data 335 Synthetic: a model created the data 336 """ 337 338 human = "human" 339 synthetic = "synthetic" 340 341 342class DataSourceProperty(BaseModel): 343 """ 344 Defines a property that can be associated with a data source. 345 346 Includes validation rules for when properties are required or not allowed 347 based on the data source type. 348 """ 349 350 name: str 351 type: Type[Union[str, int, float]] 352 required_for: List[DataSourceType] = [] 353 not_allowed_for: List[DataSourceType] = [] 354 355 356class DataSource(BaseModel): 357 """ 358 Represents the origin of data, either human or synthetic, with associated properties. 359 360 Properties vary based on the source type - for synthetic sources this includes 361 model information, for human sources this includes creator information. 362 """ 363 364 type: DataSourceType 365 properties: Dict[str, str | int | float] = Field( 366 default={}, 367 description="Properties describing the data source. For synthetic things like model. 
For human, the human's name.", 368 ) 369 370 _data_source_properties = [ 371 DataSourceProperty( 372 name="created_by", 373 type=str, 374 required_for=[DataSourceType.human], 375 not_allowed_for=[DataSourceType.synthetic], 376 ), 377 DataSourceProperty( 378 name="model_name", 379 type=str, 380 required_for=[DataSourceType.synthetic], 381 not_allowed_for=[DataSourceType.human], 382 ), 383 DataSourceProperty( 384 name="model_provider", 385 type=str, 386 required_for=[DataSourceType.synthetic], 387 not_allowed_for=[DataSourceType.human], 388 ), 389 DataSourceProperty( 390 name="adapter_name", 391 type=str, 392 required_for=[DataSourceType.synthetic], 393 not_allowed_for=[DataSourceType.human], 394 ), 395 DataSourceProperty( 396 name="prompt_builder_name", 397 type=str, 398 not_allowed_for=[DataSourceType.human], 399 ), 400 ] 401 402 @model_validator(mode="after") 403 def validate_type(self) -> "DataSource": 404 if self.type not in DataSourceType: 405 raise ValueError(f"Invalid data source type: {self.type}") 406 return self 407 408 @model_validator(mode="after") 409 def validate_properties(self) -> "DataSource": 410 for prop in self._data_source_properties: 411 # Check the property type is correct 412 if prop.name in self.properties: 413 if not isinstance(self.properties[prop.name], prop.type): 414 raise ValueError( 415 f"'{prop.name}' must be of type {prop.type.__name__} for {self.type} data source" 416 ) 417 # Check the property is required for the data source type 418 if self.type in prop.required_for: 419 if prop.name not in self.properties: 420 raise ValueError( 421 f"'{prop.name}' is required for {self.type} data source" 422 ) 423 # Check the property is not allowed for the data source type 424 elif self.type in prop.not_allowed_for and prop.name in self.properties: 425 raise ValueError( 426 f"'{prop.name}' is not allowed for {self.type} data source" 427 ) 428 return self 429 430 @model_validator(mode="after") 431 def validate_no_empty_properties(self) -> Self: 432 for prop, value in self.properties.items(): 433 if isinstance(value, str) and value == "": 434 raise ValueError( 435 f"Property '{prop}' must be a non-empty string for {self.type} data source" 436 ) 437 return self 438 439 440class TaskRun(KilnParentedModel): 441 """ 442 Represents a single execution of a Task. 443 444 Contains the input used, its source, the output produced, and optional 445 repair information if the output needed correction. 446 """ 447 448 input: str = Field( 449 description="The inputs to the task. JSON formatted for structured input, plaintext for unstructured input." 450 ) 451 input_source: DataSource | None = Field( 452 default=None, description="The source of the input: human or synthetic." 453 ) 454 455 output: TaskOutput = Field(description="The output of the task run.") 456 repair_instructions: str | None = Field( 457 default=None, 458 description="Instructions for fixing the output. Should define what is wrong, and how to fix it. Will be used by models for both generating a fixed output, and evaluating future models.", 459 ) 460 repaired_output: TaskOutput | None = Field( 461 default=None, 462 description="An version of the output with issues fixed. This must be a 'fixed' version of the existing output, and not an entirely new output. 
If you wish to generate an ideal curatorial output for this task unrelated to this output, generate a new TaskOutput with type 'human' instead of using this field.", 463 ) 464 intermediate_outputs: Dict[str, str] | None = Field( 465 default=None, 466 description="Intermediate outputs from the task run. Keys are the names of the intermediate output steps (cot=chain of thought, etc), values are the output data.", 467 ) 468 tags: List[str] = Field( 469 default=[], 470 description="Tags for the task run. Tags are used to categorize task runs for filtering and reporting.", 471 ) 472 473 def parent_task(self) -> Task | None: 474 if not isinstance(self.parent, Task): 475 return None 476 return self.parent 477 478 @model_validator(mode="after") 479 def validate_input_format(self) -> Self: 480 task = self.parent_task() 481 if task is None: 482 # don't validate this relationship until we have a path or parent. Give them time to build it (but will catch it before saving) 483 return self 484 485 # validate output 486 if task.input_json_schema is not None: 487 try: 488 validate_schema(json.loads(self.input), task.input_json_schema) 489 except json.JSONDecodeError: 490 raise ValueError("Input is not a valid JSON object") 491 except jsonschema.exceptions.ValidationError as e: 492 raise ValueError(f"Input does not match task input schema: {e}") 493 return self 494 495 @model_validator(mode="after") 496 def validate_output_format(self) -> Self: 497 task = self.parent_task() 498 if task is None: 499 return self 500 501 self.output.validate_output_format(task) 502 return self 503 504 @model_validator(mode="after") 505 def validate_repaired_output(self) -> Self: 506 if self.repaired_output is not None: 507 if self.repaired_output.rating is not None: 508 raise ValueError( 509 "Repaired output rating must be None. Repaired outputs are assumed to have a perfect rating, as they have been fixed." 510 ) 511 if self.repair_instructions is None and self.repaired_output is not None: 512 raise ValueError( 513 "Repair instructions are required if providing a repaired output." 514 ) 515 if self.repair_instructions is not None and self.repaired_output is None: 516 raise ValueError( 517 "A repaired output is required if providing repair instructions." 518 ) 519 return self 520 521 @model_validator(mode="after") 522 def validate_input_source(self, info: ValidationInfo) -> Self: 523 # On strict mode and not loaded from file, we validate input_source is not None. 524 # We want to be able to load any data, even if it's not perfect. But we want to create perfect data when adding new data. 525 if not strict_mode(): 526 return self 527 if self.loaded_from_file(info): 528 return self 529 if self.input_source is None: 530 raise ValueError("input_source is required when strict mode is enabled") 531 return self 532 533 @model_validator(mode="after") 534 def validate_tags(self) -> Self: 535 for tag in self.tags: 536 if not tag: 537 raise ValueError("Tags cannot be empty strings") 538 if " " in tag: 539 raise ValueError("Tags cannot contain spaces. 
Try underscores.") 540 541 return self 542 543 544# Define the type alias for clarity 545DatasetFilter = Callable[[TaskRun], bool] 546 547 548def AllDatasetFilter(_: TaskRun) -> bool: 549 return True 550 551 552def HighRatingDatasetFilter(task_run: TaskRun) -> bool: 553 if task_run.output is None or task_run.output.rating is None: 554 return False 555 return task_run.output.rating.is_high_quality() 556 557 558class DatasetSplitDefinition(BaseModel): 559 """ 560 A definition of a split in a dataset. 561 562 Example: name="train", description="The training set", percentage=0.8 (80% of the dataset) 563 """ 564 565 name: str = NAME_FIELD 566 description: str | None = Field( 567 default=None, 568 description="A description of the dataset for you and your team. Not used in training.", 569 ) 570 percentage: float = Field( 571 ge=0.0, 572 le=1.0, 573 description="The percentage of the dataset that this split represents (between 0 and 1).", 574 ) 575 576 577AllSplitDefinition: list[DatasetSplitDefinition] = [ 578 DatasetSplitDefinition(name="all", percentage=1.0) 579] 580Train80Test20SplitDefinition: list[DatasetSplitDefinition] = [ 581 DatasetSplitDefinition(name="train", percentage=0.8), 582 DatasetSplitDefinition(name="test", percentage=0.2), 583] 584Train60Test20Val20SplitDefinition: list[DatasetSplitDefinition] = [ 585 DatasetSplitDefinition(name="train", percentage=0.6), 586 DatasetSplitDefinition(name="test", percentage=0.2), 587 DatasetSplitDefinition(name="val", percentage=0.2), 588] 589 590 591class DatasetSplit(KilnParentedModel): 592 """ 593 A collection of task runs, with optional splits (train, test, validation). 594 595 Used to freeze a dataset into train/test/validation splits for repeatable fine-tuning or other tasks. 596 597 Maintains a list of IDs for each split, to avoid data duplication. 598 """ 599 600 name: str = NAME_FIELD 601 description: str | None = Field( 602 default=None, 603 description="A description of the dataset for you and your team. Not used in training.", 604 ) 605 splits: list[DatasetSplitDefinition] = Field( 606 default_factory=list, 607 description="The splits in the dataset.", 608 ) 609 split_contents: dict[str, list[str]] = Field( 610 description="The contents of each split in the dataset. The key is the split name, and the value is a list of task run IDs.", 611 ) 612 613 @model_validator(mode="after") 614 def validate_split_percentages(self) -> "DatasetSplit": 615 total = sum(split.percentage for split in self.splits) 616 if not math.isclose(total, 1.0, rel_tol=1e-9): 617 raise ValueError(f"The sum of split percentages must be 1.0 (got {total})") 618 return self 619 620 @classmethod 621 def from_task( 622 cls, 623 name: str, 624 task: "Task", 625 splits: list[DatasetSplitDefinition], 626 filter: DatasetFilter = AllDatasetFilter, 627 description: str | None = None, 628 ): 629 """ 630 Build a dataset split from a task. 
631 """ 632 split_contents = cls.build_split_contents(task, splits, filter) 633 return cls( 634 parent=task, 635 name=name, 636 description=description, 637 splits=splits, 638 split_contents=split_contents, 639 ) 640 641 @classmethod 642 def build_split_contents( 643 cls, 644 task: "Task", 645 splits: list[DatasetSplitDefinition], 646 filter: DatasetFilter, 647 ) -> dict[str, list[str]]: 648 valid_ids = [] 649 for task_run in task.runs(): 650 if filter(task_run): 651 valid_ids.append(task_run.id) 652 653 # Shuffle and split by split percentage 654 random.shuffle(valid_ids) 655 split_contents = {} 656 start_idx = 0 657 remaining_items = len(valid_ids) 658 659 # Handle all splits except the last one 660 for split in splits[:-1]: 661 split_size = round(len(valid_ids) * split.percentage) 662 split_contents[split.name] = valid_ids[start_idx : start_idx + split_size] 663 start_idx += split_size 664 remaining_items -= split_size 665 666 # Last split gets all remaining items (for rounding) 667 if splits: 668 split_contents[splits[-1].name] = valid_ids[start_idx:] 669 670 return split_contents 671 672 def parent_task(self) -> "Task | None": 673 # inline import to avoid circular import 674 from kiln_ai.datamodel import Task 675 676 if not isinstance(self.parent, Task): 677 return None 678 return self.parent 679 680 def missing_count(self) -> int: 681 """ 682 Returns: 683 int: the number of task runs that have an ID persisted in this dataset split, but no longer exist in the dataset 684 """ 685 parent = self.parent_task() 686 if parent is None: 687 raise ValueError("DatasetSplit has no parent task") 688 689 runs = parent.runs() 690 all_ids = set(run.id for run in runs) 691 all_ids_in_splits = set() 692 for ids in self.split_contents.values(): 693 all_ids_in_splits.update(ids) 694 missing = all_ids_in_splits - all_ids 695 return len(missing) 696 697 698class TaskRequirement(BaseModel): 699 """ 700 Defines a specific requirement that should be met by task outputs. 701 702 Includes an identifier, name, description, instruction for meeting the requirement, 703 priority level, and rating type (five_star, pass_fail, pass_fail_critical, custom). 704 """ 705 706 id: ID_TYPE = ID_FIELD 707 name: str = SHORT_NAME_FIELD 708 description: str | None = Field(default=None) 709 instruction: str = Field(min_length=1) 710 priority: Priority = Field(default=Priority.p2) 711 type: TaskOutputRatingType = Field(default=TaskOutputRatingType.five_star) 712 713 714class TaskDeterminism(str, Enum): 715 """ 716 Defines how strictly task outputs should match expected results. 717 718 - deterministic: Requires exact matches 719 - semantic_match: Allows different wording with same meaning 720 - flexible: Allows variation in both wording and meaning within requirements 721 """ 722 723 deterministic = "deterministic" # Expect exact match 724 semantic_match = "semantic_match" # Expect same meaning, but flexible on expression of the meaning 725 flexible = "flexible" # Flexible on semantic output. Eval should be custom based on parsing requirements. 726 727 728class Task( 729 KilnParentedModel, 730 KilnParentModel, 731 parent_of={ 732 "runs": TaskRun, 733 "dataset_splits": DatasetSplit, 734 "finetunes": Finetune, 735 }, 736): 737 """ 738 Represents a specific task to be performed, with associated requirements and validation rules. 739 740 Contains the task definition, requirements, input/output schemas, and maintains 741 a collection of task runs. 
742 """ 743 744 name: str = NAME_FIELD 745 description: str | None = Field( 746 default=None, 747 description="A description of the task for you and your team. Will not be used in prompts/training/validation.", 748 ) 749 instruction: str = Field( 750 min_length=1, 751 description="The instructions for the task. Will be used in prompts/training/validation.", 752 ) 753 requirements: List[TaskRequirement] = Field(default=[]) 754 output_json_schema: JsonObjectSchema | None = None 755 input_json_schema: JsonObjectSchema | None = None 756 thinking_instruction: str | None = Field( 757 default=None, 758 description="Instructions for the model 'thinking' about the requirement prior to answering. Used for chain of thought style prompting.", 759 ) 760 761 def output_schema(self) -> Dict | None: 762 if self.output_json_schema is None: 763 return None 764 return schema_from_json_str(self.output_json_schema) 765 766 def input_schema(self) -> Dict | None: 767 if self.input_json_schema is None: 768 return None 769 return schema_from_json_str(self.input_json_schema) 770 771 # Needed for typechecking. TODO P2: fix this in KilnParentModel 772 def runs(self) -> list[TaskRun]: 773 return super().runs() # type: ignore 774 775 def dataset_splits(self) -> list[DatasetSplit]: 776 return super().dataset_splits() # type: ignore 777 778 def finetunes(self) -> list[Finetune]: 779 return super().finetunes() # type: ignore 780 781 782class Project(KilnParentModel, parent_of={"tasks": Task}): 783 """ 784 A collection of related tasks. 785 786 Projects organize tasks into logical groups and provide high-level descriptions 787 of the overall goals. 788 """ 789 790 name: str = NAME_FIELD 791 description: str | None = Field( 792 default=None, 793 description="A description of the project for you and your team. Will not be used in prompts/training/validation.", 794 ) 795 796 # Needed for typechecking. TODO P2: fix this in KilnParentModel 797 def tasks(self) -> list[Task]: 798 return super().tasks() # type: ignore
class Task(KilnParentedModel, KilnParentModel, parent_of={"runs": TaskRun, "dataset_splits": DatasetSplit, "finetunes": Finetune})
Represents a specific task to be performed, with associated requirements and validation rules.
Contains the task definition, requirements, input/output schemas, and maintains a collection of task runs.
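A short sketch of defining a structured task. It assumes JsonObjectSchema accepts standard JSON Schema strings describing objects (consistent with schema_from_json_str); the schema bodies are illustrative.

from kiln_ai.datamodel import Priority, Task, TaskRequirement

task = Task(
    name="extract_names",
    instruction="Extract all person names from the input text.",
    requirements=[
        TaskRequirement(
            name="complete",
            instruction="Every person mentioned must appear in the output.",
            priority=Priority.p1,
        ),
    ],
    # Schemas are stored as JSON strings and parsed on demand.
    input_json_schema='{"type": "object", "properties": {"text": {"type": "string"}}, "required": ["text"]}',
    output_json_schema='{"type": "object", "properties": {"names": {"type": "array", "items": {"type": "string"}}}, "required": ["names"]}',
)

schema = task.output_schema()  # parsed dict, or None when no schema is set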
class Project(KilnParentModel, parent_of={"tasks": Task})
A collection of related tasks.
Projects organize tasks into logical groups and provide high-level descriptions of the overall goals.
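A sketch of grouping tasks under a project. The child accessor is grounded in the tasks() method above, though listing children generally requires the models to have been saved via the base model layer (an assumption here).

from kiln_ai.datamodel import Project, Task

project = Project(
    name="support_bot",
    description="All tasks for the support assistant.",
)
triage = Task(
    parent=project,
    name="triage",
    instruction="Classify the ticket severity as low, medium, or high.",
)

# After persisting, child tasks are discoverable from the parent:
# project.tasks() -> list[Task]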
class TaskRun(KilnParentedModel)
Represents a single execution of a Task.
Contains the input used, its source, the output produced, and optional repair information if the output needed correction.
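A sketch of a run that records a repair, following the validators below: repair_instructions and repaired_output must be provided together, the repaired output may not carry a rating, and tags must be non-empty and space-free. Strict mode is assumed off, so the optional sources are omitted.

from kiln_ai.datamodel import TaskOutput, TaskRun

run = TaskRun(
    input="What is 2 + 2?",
    output=TaskOutput(output="5"),
    # Repair fields are validated as a pair: one without the other fails.
    repair_instructions="The sum is wrong; it should be 4.",
    repaired_output=TaskOutput(output="4"),  # must not carry a rating
    tags=["math", "needs_review"],  # non-empty, no spaces
)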
@model_validator(mode="after")
def validate_input_format(self) -> Self
@model_validator(mode="after")
def validate_repaired_output(self) -> Self
@model_validator(mode="after")
def validate_input_source(self, info: ValidationInfo) -> Self
class TaskOutput(KilnBaseModel)
An output for a specific task run.
Contains the actual output content, its source (human or synthetic), and optional rating information.
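A sketch of schema checking via validate_output_format, assuming JsonObjectSchema accepts a standard JSON Schema string (the schema body here is illustrative).

import json

from kiln_ai.datamodel import Task, TaskOutput

task = Task(
    name="adder",
    instruction="Add the two numbers and return JSON.",
    output_json_schema='{"type": "object", "properties": {"sum": {"type": "number"}}, "required": ["sum"]}',
)

ok = TaskOutput(output=json.dumps({"sum": 4}))
ok.validate_output_format(task)  # passes; raises ValueError on mismatch

# TaskOutput(output="not json").validate_output_format(task)
# -> ValueError: Output is not a valid JSON object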
def validate_output_format(self, task: Task) -> Self
@model_validator(mode="after")
def validate_output_source(self, info: ValidationInfo) -> Self
class TaskOutputRating(KilnBaseModel)
A rating for a task output, including an overall rating and ratings for each requirement.
Supports:
- five_star: 1-5 star ratings
- pass_fail: boolean pass/fail (1.0 = pass, 0.0 = fail)
- pass_fail_critical: tri-state (1.0 = pass, 0.0 = fail, -1.0 = critical fail)
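A sketch of the value conventions, per the validators below; note that five_star values must be whole numbers (stored as floats) in the range 1 to 5.

from kiln_ai.datamodel import TaskOutputRating, TaskOutputRatingType

stars = TaskOutputRating(value=5.0)  # type defaults to five_star
assert stars.is_high_quality()  # four stars or more

failed = TaskOutputRating(type=TaskOutputRatingType.pass_fail, value=0.0)
assert not failed.is_high_quality()  # only 1.0 (pass) counts as high quality

# TaskOutputRating(value=4.5)
# -> ValueError: ... of type five_star must be an integer value (1-5)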
@model_validator(mode="before")
def upgrade_old_format(cls, data: dict) -> dict
def is_high_quality(self) -> bool
@model_validator(mode="after")
def validate_rating(self) -> Self
class Priority(IntEnum)
Defines priority levels for tasks and requirements, where P0 is highest priority.
class DataSource(BaseModel)
Represents the origin of data, either human or synthetic, with associated properties.
Properties vary based on the source type - for synthetic sources this includes model information, for human sources this includes creator information.
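A sketch of the property rules enforced by validate_properties below; the model and adapter names are illustrative.

from kiln_ai.datamodel import DataSource, DataSourceType

human = DataSource(
    type=DataSourceType.human,
    properties={"created_by": "alice"},  # required for human sources
)

synthetic = DataSource(
    type=DataSourceType.synthetic,
    properties={
        "model_name": "gpt-4o",          # required for synthetic sources
        "model_provider": "openai",      # required for synthetic sources
        "adapter_name": "kiln_adapter",  # required for synthetic sources
        "prompt_builder_name": "basic",  # optional, but not allowed for human
    },
)

# Mixing them fails validation, e.g. a human source with a model property:
# DataSource(type=DataSourceType.human,
#            properties={"created_by": "alice", "model_name": "gpt-4o"})
# -> ValueError: 'model_name' is not allowed for ... human data source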
@model_validator(mode="after")
def validate_properties(self) -> "DataSource"
@model_validator(mode="after")
def validate_no_empty_properties(self) -> Self
class DataSourceType(str, Enum)
The source type of a piece of data.
Human: a human created the data
Synthetic: a model created the data
class DataSourceProperty(BaseModel)
Defines a property that can be associated with a data source.
Includes validation rules for when properties are required or not allowed based on the data source type.
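For reference, this is how the built-in rules on DataSource are declared; a custom rule would look the same.

from kiln_ai.datamodel import DataSourceProperty, DataSourceType

rule = DataSourceProperty(
    name="model_name",
    type=str,
    required_for=[DataSourceType.synthetic],
    not_allowed_for=[DataSourceType.human],
)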
class Finetune(KilnParentedModel)
Base model for Kiln models that have a parent-child relationship. This base class is for child models.
This class provides functionality for managing hierarchical relationships between models, including parent reference handling and file system organization.
Attributes:
    parent (KilnBaseModel): Reference to the parent model instance. Not persisted, just in memory.
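A sketch of the parent-child wiring for a fine-tune record. Passing parent= at construction mirrors DatasetSplit.from_task above; the provider, model, and split IDs are illustrative.

from kiln_ai.datamodel import Finetune, Task

task = Task(name="classify", instruction="Classify the input.")
finetune = Finetune(
    parent=task,
    name="ft_v1",
    provider="openai",
    base_model_id="gpt-4o-mini",  # provider's model ID, not a Kiln ID
    dataset_split_id="split_v1",  # illustrative split ID
    system_message="You are a strict classifier.",
)

finetune.parent_task()  # -> task (None when unparented)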
class FineTuneStatusType(str, Enum)
The status type of a fine-tune (running, completed, failed, etc).
class TaskOutputRatingType(str, Enum)
Defines the types of rating systems available for task outputs.
class TaskRequirement(BaseModel)
Defines a specific requirement that should be met by task outputs.
Includes an identifier, name, description, instruction for meeting the requirement, priority level, and rating type (five_star, pass_fail, pass_fail_critical, custom).
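As a quick illustration, a pass/fail requirement might be declared like this; all field values are invented for the example.

from kiln_ai.datamodel import Priority, TaskOutputRatingType, TaskRequirement

requirement = TaskRequirement(
    name="Concise",  # illustrative
    description="Answers should be short.",
    instruction="The answer must be under 100 words.",
    priority=Priority.p1,
    type=TaskOutputRatingType.pass_fail,
)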
class TaskDeterminism(str, Enum):
    """
    Defines how strictly task outputs should match expected results.

    - deterministic: Requires exact matches
    - semantic_match: Allows different wording with same meaning
    - flexible: Allows variation in both wording and meaning within requirements
    """

    deterministic = "deterministic"  # Expect exact match
    semantic_match = "semantic_match"  # Expect same meaning, but flexible on expression of the meaning
    flexible = "flexible"  # Flexible on semantic output. Eval should be custom based on parsing requirements.
Defines how strictly task outputs should match expected results.
- deterministic: Requires exact matches
- semantic_match: Allows different wording with same meaning
- flexible: Allows variation in both wording and meaning within requirements
class DatasetSplitDefinition(BaseModel):
    """
    A definition of a split in a dataset.

    Example: name="train", description="The training set", percentage=0.8 (80% of the dataset)
    """

    name: str = NAME_FIELD
    description: str | None = Field(
        default=None,
        description="A description of the dataset for you and your team. Not used in training.",
    )
    percentage: float = Field(
        ge=0.0,
        le=1.0,
        description="The percentage of the dataset that this split represents (between 0 and 1).",
    )
A definition of a split in a dataset.
Example: name="train", description="The training set", percentage=0.8 (80% of the dataset)
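For instance, an 80/20 train/test pair can be declared as below. The percentages across a dataset's splits must sum to 1.0, which DatasetSplit's validator enforces.

from kiln_ai.datamodel import DatasetSplitDefinition

splits = [
    DatasetSplitDefinition(name="train", percentage=0.8),
    DatasetSplitDefinition(name="test", percentage=0.2),
]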
class DatasetSplit(KilnParentedModel):
    """
    A collection of task runs, with optional splits (train, test, validation).

    Used to freeze a dataset into train/test/validation splits for repeatable fine-tuning or other tasks.

    Maintains a list of IDs for each split, to avoid data duplication.
    """

    name: str = NAME_FIELD
    description: str | None = Field(
        default=None,
        description="A description of the dataset for you and your team. Not used in training.",
    )
    splits: list[DatasetSplitDefinition] = Field(
        default_factory=list,
        description="The splits in the dataset.",
    )
    split_contents: dict[str, list[str]] = Field(
        description="The contents of each split in the dataset. The key is the split name, and the value is a list of task run IDs.",
    )

    @model_validator(mode="after")
    def validate_split_percentages(self) -> "DatasetSplit":
        total = sum(split.percentage for split in self.splits)
        if not math.isclose(total, 1.0, rel_tol=1e-9):
            raise ValueError(f"The sum of split percentages must be 1.0 (got {total})")
        return self

    @classmethod
    def from_task(
        cls,
        name: str,
        task: "Task",
        splits: list[DatasetSplitDefinition],
        filter: DatasetFilter = AllDatasetFilter,
        description: str | None = None,
    ):
        """
        Build a dataset split from a task.
        """
        split_contents = cls.build_split_contents(task, splits, filter)
        return cls(
            parent=task,
            name=name,
            description=description,
            splits=splits,
            split_contents=split_contents,
        )

    @classmethod
    def build_split_contents(
        cls,
        task: "Task",
        splits: list[DatasetSplitDefinition],
        filter: DatasetFilter,
    ) -> dict[str, list[str]]:
        valid_ids = []
        for task_run in task.runs():
            if filter(task_run):
                valid_ids.append(task_run.id)

        # Shuffle and split by split percentage
        random.shuffle(valid_ids)
        split_contents = {}
        start_idx = 0
        remaining_items = len(valid_ids)

        # Handle all splits except the last one
        for split in splits[:-1]:
            split_size = round(len(valid_ids) * split.percentage)
            split_contents[split.name] = valid_ids[start_idx : start_idx + split_size]
            start_idx += split_size
            remaining_items -= split_size

        # Last split gets all remaining items (for rounding)
        if splits:
            split_contents[splits[-1].name] = valid_ids[start_idx:]

        return split_contents

    def parent_task(self) -> "Task | None":
        # inline import to avoid circular import
        from kiln_ai.datamodel import Task

        if not isinstance(self.parent, Task):
            return None
        return self.parent

    def missing_count(self) -> int:
        """
        Returns:
            int: the number of task runs that have an ID persisted in this dataset split, but no longer exist in the dataset
        """
        parent = self.parent_task()
        if parent is None:
            raise ValueError("DatasetSplit has no parent task")

        runs = parent.runs()
        all_ids = set(run.id for run in runs)
        all_ids_in_splits = set()
        for ids in self.split_contents.values():
            all_ids_in_splits.update(ids)
        missing = all_ids_in_splits - all_ids
        return len(missing)
A collection of task runs, with optional splits (train, test, validation).
Used to freeze a dataset into train/test/validation splits for repeatable fine-tuning or other tasks.
Maintains a list of IDs for each split, to avoid data duplication.
    @model_validator(mode="after")
    def validate_split_percentages(self) -> "DatasetSplit":
        total = sum(split.percentage for split in self.splits)
        if not math.isclose(total, 1.0, rel_tol=1e-9):
            raise ValueError(f"The sum of split percentages must be 1.0 (got {total})")
        return self
    @classmethod
    def from_task(
        cls,
        name: str,
        task: "Task",
        splits: list[DatasetSplitDefinition],
        filter: DatasetFilter = AllDatasetFilter,
        description: str | None = None,
    ):
        """
        Build a dataset split from a task.
        """
        split_contents = cls.build_split_contents(task, splits, filter)
        return cls(
            parent=task,
            name=name,
            description=description,
            splits=splits,
            split_contents=split_contents,
        )
Build a dataset split from a task.
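A minimal usage sketch, assuming `task` is a loaded Task with runs and `splits` is the list from the DatasetSplitDefinition example above. The default filter (AllDatasetFilter) keeps every run, and `save_to_file` is the inherited persistence helper.

from kiln_ai.datamodel import DatasetSplit

dataset = DatasetSplit.from_task(
    name="frozen_v1",  # illustrative
    task=task,
    splits=splits,
    description="Frozen 80/20 split for fine-tuning",
)
dataset.save_to_file()  # freeze the split membership to disk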
    @classmethod
    def build_split_contents(
        cls,
        task: "Task",
        splits: list[DatasetSplitDefinition],
        filter: DatasetFilter,
    ) -> dict[str, list[str]]:
        valid_ids = []
        for task_run in task.runs():
            if filter(task_run):
                valid_ids.append(task_run.id)

        # Shuffle and split by split percentage
        random.shuffle(valid_ids)
        split_contents = {}
        start_idx = 0
        remaining_items = len(valid_ids)

        # Handle all splits except the last one
        for split in splits[:-1]:
            split_size = round(len(valid_ids) * split.percentage)
            split_contents[split.name] = valid_ids[start_idx : start_idx + split_size]
            start_idx += split_size
            remaining_items -= split_size

        # Last split gets all remaining items (for rounding)
        if splits:
            split_contents[splits[-1].name] = valid_ids[start_idx:]

        return split_contents
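To make the rounding behaviour concrete, here is the same allocation applied by hand to ten illustrative IDs with an 0.8/0.2 split: every split except the last takes round(n * percentage) items, and the last split absorbs whatever remains, so no ID is lost to rounding.

# Hand-run of the allocation logic on made-up IDs
valid_ids = [str(i) for i in range(10)]
train_size = round(len(valid_ids) * 0.8)  # round(8.0) == 8
train_ids = valid_ids[:train_size]        # 8 IDs
test_ids = valid_ids[train_size:]         # the remaining 2 IDs, rounding-safe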
    def missing_count(self) -> int:
        """
        Returns:
            int: the number of task runs that have an ID persisted in this dataset split, but no longer exist in the dataset
        """
        parent = self.parent_task()
        if parent is None:
            raise ValueError("DatasetSplit has no parent task")

        runs = parent.runs()
        all_ids = set(run.id for run in runs)
        all_ids_in_splits = set()
        for ids in self.split_contents.values():
            all_ids_in_splits.update(ids)
        missing = all_ids_in_splits - all_ids
        return len(missing)
Returns:
    int: the number of task runs that have an ID persisted in this dataset split, but no longer exist in the dataset
class RequirementRating(BaseModel):
    """Rating for a specific requirement within a task output."""

    value: float = Field(
        description="The rating value. Interpretation depends on rating type"
    )
    type: TaskOutputRatingType = Field(description="The type of rating")
Rating for a specific requirement within a task output.
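Requirement ratings are keyed by requirement ID inside TaskOutputRating.requirement_ratings. A sketch, reusing the hypothetical `requirement` from the TaskRequirement example above; note that five_star values must be whole numbers between 1 and 5, per the rating validators.

from kiln_ai.datamodel import (
    RequirementRating,
    TaskOutputRating,
    TaskOutputRatingType,
)

rating = TaskOutputRating(
    type=TaskOutputRatingType.five_star,
    value=5.0,  # overall rating: whole-number float between 1 and 5
    requirement_ratings={
        requirement.id: RequirementRating(
            value=1.0,  # 1.0 = pass for a pass_fail requirement
            type=TaskOutputRatingType.pass_fail,
        ),
    },
)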