kiln_ai.datamodel.extraction

  1import logging
  2from enum import Enum
  3from typing import TYPE_CHECKING, Any, List, Union
  4
  5import anyio
  6from pydantic import (
  7    BaseModel,
  8    Field,
  9    SerializationInfo,
 10    ValidationInfo,
 11    computed_field,
 12    field_serializer,
 13    field_validator,
 14    model_validator,
 15)
 16from typing_extensions import Self
 17
 18from kiln_ai.datamodel.basemodel import (
 19    ID_TYPE,
 20    FilenameString,
 21    KilnAttachmentModel,
 22    KilnParentedModel,
 23    KilnParentModel,
 24)
 25from kiln_ai.datamodel.chunk import ChunkedDocument
 26
 27logger = logging.getLogger(__name__)
 28
 29if TYPE_CHECKING:
 30    from kiln_ai.datamodel.project import Project
 31
 32logger = logging.getLogger(__name__)
 33
 34
# Broad family a document's file type belongs to. Each member is a key of
# SUPPORTED_MIME_TYPES, which lists the MIME types accepted for that kind.
class Kind(str, Enum):
    DOCUMENT = "document"
    IMAGE = "image"
    VIDEO = "video"
    AUDIO = "audio"
 40
 41
# Text formats, valued by their MIME type string. Used as the type of
# ExtractorConfig.output_format and of the entries in
# ExtractorConfig.passthrough_mimetypes.
class OutputFormat(str, Enum):
    TEXT = "text/plain"
    MARKDOWN = "text/markdown"
 45
 46
# Identifies which type of extractor an ExtractorConfig uses; only "litellm"
# is defined here.
class ExtractorType(str, Enum):
    LITELLM = "litellm"
 49
 50
# Accepted MIME types, grouped by Kind. FileInfo.validate_mime_type rejects any
# MIME type that is not listed here, and get_kind_from_mime_type maps a MIME
# type back to the Kind whose set contains it.
SUPPORTED_MIME_TYPES = {
    Kind.DOCUMENT: {
        "application/pdf",
        "text/plain",
        "text/markdown",
        "text/html",
        "text/md",
    },
    Kind.IMAGE: {
        "image/png",
        "image/jpeg",
    },
    Kind.VIDEO: {
        "video/mp4",
        "video/quicktime",
    },
    Kind.AUDIO: {
        "audio/wav",
        "audio/mpeg",
        "audio/ogg",
    },
}
 73
 74
# A name/label pair describing a model.
# NOTE(review): not referenced elsewhere in this module; presumably consumed by
# API or UI code that lists available extraction models — confirm.
class ExtractionModel(BaseModel):
    name: str
    label: str
 78
 79
 80def validate_prompt(prompt: Any, name: str):
 81    if not isinstance(prompt, str):
 82        raise ValueError(f"{name} must be a string.")
 83    if prompt == "":
 84        raise ValueError(f"{name} cannot be empty.")
 85
 86
# Records how an Extraction's output was obtained (see Extraction.source).
# NOTE(review): presumably PASSTHROUGH corresponds to files matched by
# ExtractorConfig.passthrough_mimetypes (text returned as is) and PROCESSED to
# output produced by running the extractor — confirm against the extractor code.
class ExtractionSource(str, Enum):
    PROCESSED = "processed"
    PASSTHROUGH = "passthrough"
 90
 91
 92class Extraction(
 93    KilnParentedModel, KilnParentModel, parent_of={"chunked_documents": ChunkedDocument}
 94):
 95    source: ExtractionSource = Field(
 96        description="The source of the extraction.",
 97    )
 98    extractor_config_id: ID_TYPE = Field(
 99        description="The ID of the extractor config used to extract the data.",
100    )
101    output: KilnAttachmentModel = Field(
102        description="The extraction output.",
103    )
104
105    def parent_document(self) -> Union["Document", None]:
106        if self.parent is None or self.parent.__class__.__name__ != "Document":
107            return None
108        return self.parent  # type: ignore
109
110    async def output_content(self) -> str | None:
111        if not self.path:
112            raise ValueError(
113                "Failed to resolve the path of extraction output attachment because the extraction does not have a path."
114            )
115
116        full_path = self.output.resolve_path(self.path.parent)
117
118        try:
119            return await anyio.Path(full_path).read_text(encoding="utf-8")
120        except Exception as e:
121            logger.error(
122                f"Failed to read extraction output for {full_path}: {e}", exc_info=True
123            )
124            raise ValueError(f"Failed to read extraction output: {e}")
125
126    def chunked_documents(self, readonly: bool = False) -> list[ChunkedDocument]:
127        return super().chunked_documents(readonly=readonly)  # type: ignore
128
129
# Configuration describing how documents are extracted: which model/provider
# to call, the output format, which MIME types bypass extraction, and the
# per-kind prompts (stored in `properties`).
# (Kept as a comment rather than a class docstring so the generated model
# schema is unchanged.)
class ExtractorConfig(KilnParentedModel):
    name: FilenameString = Field(
        description="A name to identify the extractor config.",
    )
    is_archived: bool = Field(
        default=False,
        description="Whether the extractor config is archived. Archived extractor configs are not shown in the UI and are not available for use.",
    )
    description: str | None = Field(
        default=None, description="The description of the extractor config"
    )
    model_provider_name: str = Field(
        description="The name of the model provider to use for the extractor config.",
    )
    model_name: str = Field(
        description="The name of the model to use for the extractor config.",
    )
    output_format: OutputFormat = Field(
        default=OutputFormat.MARKDOWN,
        description="The format to use for the output.",
    )
    passthrough_mimetypes: list[OutputFormat] = Field(
        default_factory=list,
        description="If the mimetype is in this list, the extractor will not be used and the text content of the file will be returned as is.",
    )
    extractor_type: ExtractorType = Field(
        description="This is used to determine the type of extractor to use.",
    )
    properties: dict[str, str | int | float | bool | dict[str, str] | None] = Field(
        default_factory=dict,
        description="Properties to be used to execute the extractor config. This is extractor_type specific and should serialize to a json dict.",
    )

    @field_validator("properties")
    @classmethod
    def validate_properties(
        cls, properties: dict[str, Any], info: ValidationInfo
    ) -> dict[str, Any]:
        """Require a non-empty string prompt for each document kind.

        Returns a new dict holding only the four prompt keys, in the order
        document, image, video, audio.
        NOTE(review): any other keys supplied in ``properties`` are dropped by
        this validator — confirm that is intended.

        Raises:
            ValueError: If a prompt is missing, empty, or not a string. The
                first offending key (in the order above) is reported.
        """

        def get_property(key: str) -> str:
            value = properties.get(key)
            # A missing key yields None, which is not a str, so one isinstance
            # check covers both "missing" and "wrong type".
            if not isinstance(value, str) or value == "":
                raise ValueError(f"Prompt for {key} must be a string")
            return value

        return {
            key: get_property(key)
            for key in (
                "prompt_document",
                "prompt_image",
                "prompt_video",
                "prompt_audio",
            )
        }

    def _prompt_property(self, key: str) -> str | None:
        """Return the prompt stored under ``key``, or ``None`` if absent.

        Raises:
            ValueError: If a value is present but is not a string.
        """
        prompt = self.properties.get(key)
        if prompt is None:
            return None
        if not isinstance(prompt, str):
            raise ValueError(f"Invalid {key}. {key} must be a string.")
        return prompt

    def prompt_document(self) -> str | None:
        """Return the document prompt, or ``None`` if it is not set."""
        return self._prompt_property("prompt_document")

    def prompt_video(self) -> str | None:
        """Return the video prompt, or ``None`` if it is not set."""
        return self._prompt_property("prompt_video")

    def prompt_audio(self) -> str | None:
        """Return the audio prompt, or ``None`` if it is not set."""
        return self._prompt_property("prompt_audio")

    def prompt_image(self) -> str | None:
        """Return the image prompt, or ``None`` if it is not set."""
        return self._prompt_property("prompt_image")

    # Workaround to return typed parent without importing Project
    def parent_project(self) -> Union["Project", None]:
        """Return the parent if it is a ``Project``, otherwise ``None``."""
        if self.parent is None or self.parent.__class__.__name__ != "Project":
            return None
        return self.parent  # type: ignore
228
229
# Metadata about a stored file (name, size, MIME type) plus the attachment
# that points at its content.
# (Kept as a comment rather than a class docstring so the generated model
# schema is unchanged.)
class FileInfo(BaseModel):
    filename: str = Field(description="The filename of the file")

    size: int = Field(description="The size of the file in bytes")

    mime_type: str = Field(description="The MIME type of the file")

    attachment: KilnAttachmentModel = Field(
        description="The attachment to the file",
    )

    @field_serializer("attachment")
    def serialize_attachment(
        self, attachment: KilnAttachmentModel, info: SerializationInfo
    ) -> dict:
        """Serialize the attachment with ``filename_prefix`` set to "attachment".

        The caller's serialization context is copied before the prefix is
        added, so the key does not leak into the context shared with other
        fields in the same dump.
        """
        context = {**(info.context or {}), "filename_prefix": "attachment"}
        return attachment.model_dump(mode="json", context=context)

    @field_validator("mime_type")
    @classmethod
    def validate_mime_type(cls, mime_type: str, info: ValidationInfo) -> str:
        """Accept ``mime_type`` only if it is listed in SUPPORTED_MIME_TYPES.

        Raises:
            ValueError: If the MIME type is not supported. The message includes
                the already-validated ``filename`` when available.
        """
        if any(mime_type in supported for supported in SUPPORTED_MIME_TYPES.values()):
            return mime_type

        # `filename` is declared before `mime_type`, so it is present in
        # info.data unless its own validation failed.
        filename = info.data.get("filename") or ""
        raise ValueError(f"MIME type is not supported: {mime_type} (for {filename})")
258
259
# A stored document: the original file and its metadata. It is a child model
# and also the parent of Extraction children.
# (Kept as a comment rather than a class docstring so the generated model
# schema is unchanged.)
class Document(
    KilnParentedModel, KilnParentModel, parent_of={"extractions": Extraction}
):
    # Immutable identifier: must not change once the document is created.
    name: FilenameString = Field(
        description="A name to identify the document.",
    )

    # Mutable display name; may be edited after creation.
    name_override: str | None = Field(
        default=None,
        description="A friendly name to identify the document. This is used for display purposes and can be different from the name.",
    )

    description: str = Field(description="A description for the file")

    original_file: FileInfo = Field(description="The original file")

    kind: Kind = Field(
        description="The kind of document. The kind is a broad family of filetypes that can be handled in a similar way"
    )

    tags: List[str] = Field(
        default_factory=list,
        description="Tags for the document. Tags are used to categorize documents for filtering and reporting.",
    )

    @model_validator(mode="after")
    def validate_tags(self) -> Self:
        """Reject any tag that is empty or contains a space.

        Tags are checked in order; the first offending tag determines the
        error that is raised.
        """
        for candidate in self.tags:
            if not candidate:
                raise ValueError("Tags cannot be empty strings")
            elif " " in candidate:
                raise ValueError("Tags cannot contain spaces. Try underscores.")

        return self

    # Workaround to return typed parent without importing Project
    def parent_project(self) -> Union["Project", None]:
        """Return the parent if it is a ``Project``, otherwise ``None``."""
        owner = self.parent
        if owner is not None and owner.__class__.__name__ == "Project":
            return owner  # type: ignore
        return None

    def extractions(self, readonly: bool = False) -> list[Extraction]:
        """Return the ``Extraction`` children of this document.

        Typed wrapper over the accessor supplied by the parent-model base
        class; ``readonly`` is passed through as given.
        """
        children = super().extractions(readonly=readonly)  # type: ignore
        return children

    # Display name: the override when one is set, otherwise the fixed name.
    # Documents created before name_override existed fall back to `name`.
    # (No docstring here: it would surface as the computed field's schema
    # description.)
    @computed_field
    @property
    def friendly_name(self) -> str:
        return self.name if not self.name_override else self.name_override
311
312
def get_kind_from_mime_type(mime_type: str) -> Kind | None:
    """Return the Kind whose supported set contains ``mime_type``.

    Kinds are searched in the order they appear in SUPPORTED_MIME_TYPES.

    Returns:
        The first matching Kind, or ``None`` if no kind lists the MIME type.
    """
    matches = (
        kind
        for kind, supported in SUPPORTED_MIME_TYPES.items()
        if mime_type in supported
    )
    return next(matches, None)
logger = <Logger kiln_ai.datamodel.extraction (WARNING)>
class Kind(builtins.str, enum.Enum):
36class Kind(str, Enum):
37    DOCUMENT = "document"
38    IMAGE = "image"
39    VIDEO = "video"
40    AUDIO = "audio"

str(object='') -> str str(bytes_or_buffer[, encoding[, errors]]) -> str

Create a new string object from the given object. If encoding or errors is specified, then the object must expose a data buffer that will be decoded using the given encoding and error handler. Otherwise, returns the result of object.__str__() (if defined) or repr(object). encoding defaults to 'utf-8'. errors defaults to 'strict'.

DOCUMENT = <Kind.DOCUMENT: 'document'>
IMAGE = <Kind.IMAGE: 'image'>
VIDEO = <Kind.VIDEO: 'video'>
AUDIO = <Kind.AUDIO: 'audio'>
class OutputFormat(builtins.str, enum.Enum):
43class OutputFormat(str, Enum):
44    TEXT = "text/plain"
45    MARKDOWN = "text/markdown"

str(object='') -> str str(bytes_or_buffer[, encoding[, errors]]) -> str

Create a new string object from the given object. If encoding or errors is specified, then the object must expose a data buffer that will be decoded using the given encoding and error handler. Otherwise, returns the result of object.__str__() (if defined) or repr(object). encoding defaults to 'utf-8'. errors defaults to 'strict'.

TEXT = <OutputFormat.TEXT: 'text/plain'>
MARKDOWN = <OutputFormat.MARKDOWN: 'text/markdown'>
class ExtractorType(builtins.str, enum.Enum):
48class ExtractorType(str, Enum):
49    LITELLM = "litellm"

str(object='') -> str str(bytes_or_buffer[, encoding[, errors]]) -> str

Create a new string object from the given object. If encoding or errors is specified, then the object must expose a data buffer that will be decoded using the given encoding and error handler. Otherwise, returns the result of object.__str__() (if defined) or repr(object). encoding defaults to 'utf-8'. errors defaults to 'strict'.

LITELLM = <ExtractorType.LITELLM: 'litellm'>
SUPPORTED_MIME_TYPES = {<Kind.DOCUMENT: 'document'>: {'text/plain', 'text/html', 'text/md', 'application/pdf', 'text/markdown'}, <Kind.IMAGE: 'image'>: {'image/png', 'image/jpeg'}, <Kind.VIDEO: 'video'>: {'video/quicktime', 'video/mp4'}, <Kind.AUDIO: 'audio'>: {'audio/wav', 'audio/ogg', 'audio/mpeg'}}
class ExtractionModel(pydantic.main.BaseModel):
76class ExtractionModel(BaseModel):
77    name: str
78    label: str

!!! abstract "Usage Documentation" Models

A base class for creating Pydantic models.

Attributes: __class_vars__: The names of the class variables defined on the model. __private_attributes__: Metadata about the private attributes of the model. __signature__: The synthesized __init__ [Signature][inspect.Signature] of the model.

__pydantic_complete__: Whether model building is completed, or if there are still undefined fields.
__pydantic_core_schema__: The core schema of the model.
__pydantic_custom_init__: Whether the model has a custom `__init__` function.
__pydantic_decorators__: Metadata containing the decorators defined on the model.
    This replaces `Model.__validators__` and `Model.__root_validators__` from Pydantic V1.
__pydantic_generic_metadata__: Metadata for generic models; contains data used for a similar purpose to
    __args__, __origin__, __parameters__ in typing-module generics. May eventually be replaced by these.
__pydantic_parent_namespace__: Parent namespace of the model, used for automatic rebuilding of models.
__pydantic_post_init__: The name of the post-init method for the model, if defined.
__pydantic_root_model__: Whether the model is a [`RootModel`][pydantic.root_model.RootModel].
__pydantic_serializer__: The `pydantic-core` `SchemaSerializer` used to dump instances of the model.
__pydantic_validator__: The `pydantic-core` `SchemaValidator` used to validate instances of the model.

__pydantic_fields__: A dictionary of field names and their corresponding [`FieldInfo`][pydantic.fields.FieldInfo] objects.
__pydantic_computed_fields__: A dictionary of computed field names and their corresponding [`ComputedFieldInfo`][pydantic.fields.ComputedFieldInfo] objects.

__pydantic_extra__: A dictionary containing extra values, if [`extra`][pydantic.config.ConfigDict.extra]
    is set to `'allow'`.
__pydantic_fields_set__: The names of fields explicitly set during instantiation.
__pydantic_private__: Values of private attributes set on the model instance.
name: str
label: str
model_config: ClassVar[pydantic.config.ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

def validate_prompt(prompt: Any, name: str):
81def validate_prompt(prompt: Any, name: str):
82    if not isinstance(prompt, str):
83        raise ValueError(f"{name} must be a string.")
84    if prompt == "":
85        raise ValueError(f"{name} cannot be empty.")
class ExtractionSource(builtins.str, enum.Enum):
88class ExtractionSource(str, Enum):
89    PROCESSED = "processed"
90    PASSTHROUGH = "passthrough"

str(object='') -> str str(bytes_or_buffer[, encoding[, errors]]) -> str

Create a new string object from the given object. If encoding or errors is specified, then the object must expose a data buffer that will be decoded using the given encoding and error handler. Otherwise, returns the result of object.__str__() (if defined) or repr(object). encoding defaults to 'utf-8'. errors defaults to 'strict'.

PROCESSED = <ExtractionSource.PROCESSED: 'processed'>
PASSTHROUGH = <ExtractionSource.PASSTHROUGH: 'passthrough'>
class Extraction(kiln_ai.datamodel.basemodel.KilnParentedModel, kiln_ai.datamodel.basemodel.KilnParentModel):
 93class Extraction(
 94    KilnParentedModel, KilnParentModel, parent_of={"chunked_documents": ChunkedDocument}
 95):
 96    source: ExtractionSource = Field(
 97        description="The source of the extraction.",
 98    )
 99    extractor_config_id: ID_TYPE = Field(
100        description="The ID of the extractor config used to extract the data.",
101    )
102    output: KilnAttachmentModel = Field(
103        description="The extraction output.",
104    )
105
106    def parent_document(self) -> Union["Document", None]:
107        if self.parent is None or self.parent.__class__.__name__ != "Document":
108            return None
109        return self.parent  # type: ignore
110
111    async def output_content(self) -> str | None:
112        if not self.path:
113            raise ValueError(
114                "Failed to resolve the path of extraction output attachment because the extraction does not have a path."
115            )
116
117        full_path = self.output.resolve_path(self.path.parent)
118
119        try:
120            return await anyio.Path(full_path).read_text(encoding="utf-8")
121        except Exception as e:
122            logger.error(
123                f"Failed to read extraction output for {full_path}: {e}", exc_info=True
124            )
125            raise ValueError(f"Failed to read extraction output: {e}")
126
127    def chunked_documents(self, readonly: bool = False) -> list[ChunkedDocument]:
128        return super().chunked_documents(readonly=readonly)  # type: ignore

Base model for Kiln models that have a parent-child relationship. This base class is for child models.

This class provides functionality for managing hierarchical relationships between models, including parent reference handling and file system organization.

Attributes: parent (KilnBaseModel): Reference to the parent model instance. Not persisted, just in memory.

extractor_config_id: Optional[str]
output: kiln_ai.datamodel.basemodel.KilnAttachmentModel
def parent_document(self) -> Optional[Document]:
106    def parent_document(self) -> Union["Document", None]:
107        if self.parent is None or self.parent.__class__.__name__ != "Document":
108            return None
109        return self.parent  # type: ignore
async def output_content(self) -> str | None:
111    async def output_content(self) -> str | None:
112        if not self.path:
113            raise ValueError(
114                "Failed to resolve the path of extraction output attachment because the extraction does not have a path."
115            )
116
117        full_path = self.output.resolve_path(self.path.parent)
118
119        try:
120            return await anyio.Path(full_path).read_text(encoding="utf-8")
121        except Exception as e:
122            logger.error(
123                f"Failed to read extraction output for {full_path}: {e}", exc_info=True
124            )
125            raise ValueError(f"Failed to read extraction output: {e}")
def chunked_documents(self, readonly=False) -> List[kiln_ai.datamodel.chunk.ChunkedDocument]:
643        def child_method(self, readonly: bool = False) -> list[child_class]:
644            return child_class.all_children_of_parent_path(self.path, readonly=readonly)

The type of the None singleton.

def relationship_name() -> str:
661        def relationship_name_method() -> str:
662            return relationship_name

The type of the None singleton.

def parent_type() -> Type[kiln_ai.datamodel.basemodel.KilnParentModel]:
654        def parent_class_method() -> Type[KilnParentModel]:
655            return cls

The type of the None singleton.

model_config = {'validate_assignment': True}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

def model_post_init(self: pydantic.main.BaseModel, context: Any, /) -> None:
337def init_private_attributes(self: BaseModel, context: Any, /) -> None:
338    """This function is meant to behave like a BaseModel method to initialise private attributes.
339
340    It takes context as an argument since that's what pydantic-core passes when calling it.
341
342    Args:
343        self: The BaseModel instance.
344        context: The context.
345    """
346    if getattr(self, '__pydantic_private__', None) is None:
347        pydantic_private = {}
348        for name, private_attr in self.__private_attributes__.items():
349            default = private_attr.get_default()
350            if default is not PydanticUndefined:
351                pydantic_private[name] = default
352        object_setattr(self, '__pydantic_private__', pydantic_private)

This function is meant to behave like a BaseModel method to initialise private attributes.

It takes context as an argument since that's what pydantic-core passes when calling it.

Args: self: The BaseModel instance. context: The context.

class ExtractorConfig(kiln_ai.datamodel.basemodel.KilnParentedModel):
131class ExtractorConfig(KilnParentedModel):
132    name: FilenameString = Field(
133        description="A name to identify the extractor config.",
134    )
135    is_archived: bool = Field(
136        default=False,
137        description="Whether the extractor config is archived. Archived extractor configs are not shown in the UI and are not available for use.",
138    )
139    description: str | None = Field(
140        default=None, description="The description of the extractor config"
141    )
142    model_provider_name: str = Field(
143        description="The name of the model provider to use for the extractor config.",
144    )
145    model_name: str = Field(
146        description="The name of the model to use for the extractor config.",
147    )
148    output_format: OutputFormat = Field(
149        default=OutputFormat.MARKDOWN,
150        description="The format to use for the output.",
151    )
152    passthrough_mimetypes: list[OutputFormat] = Field(
153        default_factory=list,
154        description="If the mimetype is in this list, the extractor will not be used and the text content of the file will be returned as is.",
155    )
156    extractor_type: ExtractorType = Field(
157        description="This is used to determine the type of extractor to use.",
158    )
159    properties: dict[str, str | int | float | bool | dict[str, str] | None] = Field(
160        default_factory=dict,
161        description="Properties to be used to execute the extractor config. This is extractor_type specific and should serialize to a json dict.",
162    )
163
164    @field_validator("properties")
165    @classmethod
166    def validate_properties(
167        cls, properties: dict[str, Any], info: ValidationInfo
168    ) -> dict[str, Any]:
169        def get_property(key: str) -> str:
170            value = properties.get(key)
171            if value is None or value == "" or not isinstance(value, str):
172                raise ValueError(f"Prompt for {key} must be a string")
173            return value
174
175        return {
176            "prompt_document": get_property(
177                "prompt_document",
178            ),
179            "prompt_image": get_property(
180                "prompt_image",
181            ),
182            "prompt_video": get_property(
183                "prompt_video",
184            ),
185            "prompt_audio": get_property(
186                "prompt_audio",
187            ),
188        }
189
190    def prompt_document(self) -> str | None:
191        prompt = self.properties.get("prompt_document")
192        if prompt is None:
193            return None
194        if not isinstance(prompt, str):
195            raise ValueError(
196                "Invalid prompt_document. prompt_document must be a string."
197            )
198        return prompt
199
200    def prompt_video(self) -> str | None:
201        prompt = self.properties.get("prompt_video")
202        if prompt is None:
203            return None
204        if not isinstance(prompt, str):
205            raise ValueError("Invalid prompt_video. prompt_video must be a string.")
206        return prompt
207
208    def prompt_audio(self) -> str | None:
209        prompt = self.properties.get("prompt_audio")
210        if prompt is None:
211            return None
212        if not isinstance(prompt, str):
213            raise ValueError("Invalid prompt_audio. prompt_audio must be a string.")
214        return prompt
215
216    def prompt_image(self) -> str | None:
217        prompt = self.properties.get("prompt_image")
218        if prompt is None:
219            return None
220        if not isinstance(prompt, str):
221            raise ValueError("Invalid prompt_image. prompt_image must be a string.")
222        return prompt
223
224    # Workaround to return typed parent without importing Project
225    def parent_project(self) -> Union["Project", None]:
226        if self.parent is None or self.parent.__class__.__name__ != "Project":
227            return None
228        return self.parent  # type: ignore

Base model for Kiln models that have a parent-child relationship. This base class is for child models.

This class provides functionality for managing hierarchical relationships between models, including parent reference handling and file system organization.

Attributes: parent (KilnBaseModel): Reference to the parent model instance. Not persisted, just in memory.

name: Annotated[str, BeforeValidator(func=<function name_validator.<locals>.fn at 0x7fc0765f0900>, json_schema_input_type=PydanticUndefined)]
is_archived: bool
description: str | None
model_provider_name: str
model_name: str
output_format: OutputFormat
passthrough_mimetypes: list[OutputFormat]
extractor_type: ExtractorType
properties: dict[str, str | int | float | bool | dict[str, str] | None]
@field_validator('properties')
@classmethod
def validate_properties( cls, properties: dict[str, typing.Any], info: pydantic_core.core_schema.ValidationInfo) -> dict[str, typing.Any]:
164    @field_validator("properties")
165    @classmethod
166    def validate_properties(
167        cls, properties: dict[str, Any], info: ValidationInfo
168    ) -> dict[str, Any]:
169        def get_property(key: str) -> str:
170            value = properties.get(key)
171            if value is None or value == "" or not isinstance(value, str):
172                raise ValueError(f"Prompt for {key} must be a string")
173            return value
174
175        return {
176            "prompt_document": get_property(
177                "prompt_document",
178            ),
179            "prompt_image": get_property(
180                "prompt_image",
181            ),
182            "prompt_video": get_property(
183                "prompt_video",
184            ),
185            "prompt_audio": get_property(
186                "prompt_audio",
187            ),
188        }
def prompt_document(self) -> str | None:
190    def prompt_document(self) -> str | None:
191        prompt = self.properties.get("prompt_document")
192        if prompt is None:
193            return None
194        if not isinstance(prompt, str):
195            raise ValueError(
196                "Invalid prompt_document. prompt_document must be a string."
197            )
198        return prompt
def prompt_video(self) -> str | None:
200    def prompt_video(self) -> str | None:
201        prompt = self.properties.get("prompt_video")
202        if prompt is None:
203            return None
204        if not isinstance(prompt, str):
205            raise ValueError("Invalid prompt_video. prompt_video must be a string.")
206        return prompt
def prompt_audio(self) -> str | None:
208    def prompt_audio(self) -> str | None:
209        prompt = self.properties.get("prompt_audio")
210        if prompt is None:
211            return None
212        if not isinstance(prompt, str):
213            raise ValueError("Invalid prompt_audio. prompt_audio must be a string.")
214        return prompt
def prompt_image(self) -> str | None:
216    def prompt_image(self) -> str | None:
217        prompt = self.properties.get("prompt_image")
218        if prompt is None:
219            return None
220        if not isinstance(prompt, str):
221            raise ValueError("Invalid prompt_image. prompt_image must be a string.")
222        return prompt
def parent_project(self) -> Optional[kiln_ai.datamodel.Project]:
225    def parent_project(self) -> Union["Project", None]:
226        if self.parent is None or self.parent.__class__.__name__ != "Project":
227            return None
228        return self.parent  # type: ignore
def relationship_name() -> str:
661        def relationship_name_method() -> str:
662            return relationship_name

The type of the None singleton.

def parent_type() -> Type[kiln_ai.datamodel.basemodel.KilnParentModel]:
654        def parent_class_method() -> Type[KilnParentModel]:
655            return cls

The type of the None singleton.

model_config = {'validate_assignment': True}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

def model_post_init(self: pydantic.main.BaseModel, context: Any, /) -> None:
337def init_private_attributes(self: BaseModel, context: Any, /) -> None:
338    """This function is meant to behave like a BaseModel method to initialise private attributes.
339
340    It takes context as an argument since that's what pydantic-core passes when calling it.
341
342    Args:
343        self: The BaseModel instance.
344        context: The context.
345    """
346    if getattr(self, '__pydantic_private__', None) is None:
347        pydantic_private = {}
348        for name, private_attr in self.__private_attributes__.items():
349            default = private_attr.get_default()
350            if default is not PydanticUndefined:
351                pydantic_private[name] = default
352        object_setattr(self, '__pydantic_private__', pydantic_private)

This function is meant to behave like a BaseModel method to initialise private attributes.

It takes context as an argument since that's what pydantic-core passes when calling it.

Args: self: The BaseModel instance. context: The context.

class FileInfo(pydantic.main.BaseModel):
231class FileInfo(BaseModel):
232    filename: str = Field(description="The filename of the file")
233
234    size: int = Field(description="The size of the file in bytes")
235
236    mime_type: str = Field(description="The MIME type of the file")
237
238    attachment: KilnAttachmentModel = Field(
239        description="The attachment to the file",
240    )
241
242    @field_serializer("attachment")
243    def serialize_attachment(
244        self, attachment: KilnAttachmentModel, info: SerializationInfo
245    ) -> dict:
246        context = info.context or {}
247        context["filename_prefix"] = "attachment"
248        return attachment.model_dump(mode="json", context=context)
249
250    @field_validator("mime_type")
251    @classmethod
252    def validate_mime_type(cls, mime_type: str, info: ValidationInfo) -> str:
253        filename = info.data.get("filename") or ""
254
255        for mime_types in SUPPORTED_MIME_TYPES.values():
256            if mime_type in mime_types:
257                return mime_type
258        raise ValueError(f"MIME type is not supported: {mime_type} (for {filename})")

!!! abstract "Usage Documentation" Models

A base class for creating Pydantic models.

Attributes: __class_vars__: The names of the class variables defined on the model. __private_attributes__: Metadata about the private attributes of the model. __signature__: The synthesized __init__ [Signature][inspect.Signature] of the model.

__pydantic_complete__: Whether model building is completed, or if there are still undefined fields.
__pydantic_core_schema__: The core schema of the model.
__pydantic_custom_init__: Whether the model has a custom `__init__` function.
__pydantic_decorators__: Metadata containing the decorators defined on the model.
    This replaces `Model.__validators__` and `Model.__root_validators__` from Pydantic V1.
__pydantic_generic_metadata__: Metadata for generic models; contains data used for a similar purpose to
    __args__, __origin__, __parameters__ in typing-module generics. May eventually be replaced by these.
__pydantic_parent_namespace__: Parent namespace of the model, used for automatic rebuilding of models.
__pydantic_post_init__: The name of the post-init method for the model, if defined.
__pydantic_root_model__: Whether the model is a [`RootModel`][pydantic.root_model.RootModel].
__pydantic_serializer__: The `pydantic-core` `SchemaSerializer` used to dump instances of the model.
__pydantic_validator__: The `pydantic-core` `SchemaValidator` used to validate instances of the model.

__pydantic_fields__: A dictionary of field names and their corresponding [`FieldInfo`][pydantic.fields.FieldInfo] objects.
__pydantic_computed_fields__: A dictionary of computed field names and their corresponding [`ComputedFieldInfo`][pydantic.fields.ComputedFieldInfo] objects.

__pydantic_extra__: A dictionary containing extra values, if [`extra`][pydantic.config.ConfigDict.extra]
    is set to `'allow'`.
__pydantic_fields_set__: The names of fields explicitly set during instantiation.
__pydantic_private__: Values of private attributes set on the model instance.
filename: str
size: int
mime_type: str
attachment: kiln_ai.datamodel.basemodel.KilnAttachmentModel
@field_serializer('attachment')
def serialize_attachment( self, attachment: kiln_ai.datamodel.basemodel.KilnAttachmentModel, info: pydantic_core.core_schema.SerializationInfo) -> dict:
242    @field_serializer("attachment")
243    def serialize_attachment(
244        self, attachment: KilnAttachmentModel, info: SerializationInfo
245    ) -> dict:
246        context = info.context or {}
247        context["filename_prefix"] = "attachment"
248        return attachment.model_dump(mode="json", context=context)
@field_validator('mime_type')
@classmethod
def validate_mime_type( cls, mime_type: str, info: pydantic_core.core_schema.ValidationInfo) -> str:
250    @field_validator("mime_type")
251    @classmethod
252    def validate_mime_type(cls, mime_type: str, info: ValidationInfo) -> str:
253        filename = info.data.get("filename") or ""
254
255        for mime_types in SUPPORTED_MIME_TYPES.values():
256            if mime_type in mime_types:
257                return mime_type
258        raise ValueError(f"MIME type is not supported: {mime_type} (for {filename})")
model_config: ClassVar[pydantic.config.ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

class Document(kiln_ai.datamodel.basemodel.KilnParentedModel, kiln_ai.datamodel.basemodel.KilnParentModel):
# A document model that is both a child (it can report a parent Project) and a
# parent (of its Extraction children, registered through ``parent_of``).
# NOTE: kept as comments rather than a class docstring so the model's generated
# schema description is not affected.
class Document(
    KilnParentedModel, KilnParentModel, parent_of={"extractions": Extraction}
):
    # Immutable after creation: do not change this field once the document exists.
    name: FilenameString = Field(
        description="A name to identify the document.",
    )

    # Mutable: may be updated after creation.
    name_override: str | None = Field(
        default=None,
        description="A friendly name to identify the document. This is used for display purposes and can be different from the name.",
    )

    description: str = Field(description="A description for the file")

    original_file: FileInfo = Field(description="The original file")

    kind: Kind = Field(
        description="The kind of document. The kind is a broad family of filetypes that can be handled in a similar way"
    )

    tags: List[str] = Field(
        default_factory=list,
        description="Tags for the document. Tags are used to categorize documents for filtering and reporting.",
    )

    @model_validator(mode="after")
    def validate_tags(self) -> Self:
        """Reject empty tags and tags containing a space.

        Tags are checked in order; the first offending tag determines the error.

        Raises:
            ValueError: If a tag is empty or contains a space character.
        """
        for candidate in self.tags:
            if not candidate:
                raise ValueError("Tags cannot be empty strings")
            if " " in candidate:
                raise ValueError("Tags cannot contain spaces. Try underscores.")

        return self

    # Workaround to return typed parent without importing Project
    def parent_project(self) -> Union["Project", None]:
        """Return the parent when its class is named "Project", otherwise None."""
        owner = self.parent
        # The class is matched by name so Project need not be imported here.
        if owner is not None and owner.__class__.__name__ == "Project":
            return owner  # type: ignore
        return None

    def extractions(self, readonly: bool = False) -> list[Extraction]:
        """Typed pass-through to the parent class's ``extractions`` accessor."""
        return super().extractions(readonly=readonly)  # type: ignore

    @computed_field
    @property
    def friendly_name(self) -> str:
        # Display name: the override wins whenever it is set and non-empty.
        # backward compatibility: old documents did not have name_override
        if self.name_override:
            return self.name_override
        return self.name

Base model for Kiln models that have a parent-child relationship. This base class is for child models.

This class provides functionality for managing hierarchical relationships between models, including parent reference handling and file system organization.

Attributes: parent (KilnBaseModel): Reference to the parent model instance. Not persisted, just in memory.

name: Annotated[str, BeforeValidator(func=<function name_validator.<locals>.fn at 0x7fc0765f0900>, json_schema_input_type=PydanticUndefined)]
name_override: str | None
description: str
original_file: FileInfo
kind: Kind
tags: List[str]
@model_validator(mode='after')
def validate_tags(self) -> Self:
288    @model_validator(mode="after")
289    def validate_tags(self) -> Self:
290        for tag in self.tags:
291            if not tag:
292                raise ValueError("Tags cannot be empty strings")
293            if " " in tag:
294                raise ValueError("Tags cannot contain spaces. Try underscores.")
295
296        return self
def parent_project(self) -> Optional[kiln_ai.datamodel.Project]:
299    def parent_project(self) -> Union["Project", None]:
300        if self.parent is None or self.parent.__class__.__name__ != "Project":
301            return None
302        return self.parent  # type: ignore
def extractions(self, readonly=False) -> List[Extraction]:
643        def child_method(self, readonly: bool = False) -> list[child_class]:
644            return child_class.all_children_of_parent_path(self.path, readonly=readonly)

The type of the None singleton.

friendly_name: str
307    @computed_field
308    @property
309    def friendly_name(self) -> str:
310        # backward compatibility: old documents did not have name_override
311        return self.name_override or self.name
def relationship_name() -> str:
661        def relationship_name_method() -> str:
662            return relationship_name

The type of the None singleton.

def parent_type() -> Type[kiln_ai.datamodel.basemodel.KilnParentModel]:
654        def parent_class_method() -> Type[KilnParentModel]:
655            return cls

The type of the None singleton.

model_config = {'validate_assignment': True}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

def model_post_init(self: pydantic.main.BaseModel, context: Any, /) -> None:
def init_private_attributes(self: BaseModel, context: Any, /) -> None:
    """Initialise private attributes, acting like a BaseModel method.

    ``context`` is accepted because pydantic-core passes it when calling this
    function; it is not otherwise used here.

    Args:
        self: The BaseModel instance.
        context: The context.
    """
    # Nothing to do when the private-attribute storage already exists.
    if getattr(self, '__pydantic_private__', None) is not None:
        return

    # Resolve each declared private attribute's default, in declaration order.
    resolved_defaults = (
        (attr_name, attr.get_default())
        for attr_name, attr in self.__private_attributes__.items()
    )
    # Attributes whose default is PydanticUndefined are left unset.
    private_values = {
        attr_name: value
        for attr_name, value in resolved_defaults
        if value is not PydanticUndefined
    }
    object_setattr(self, '__pydantic_private__', private_values)

This function is meant to behave like a BaseModel method to initialise private attributes.

It takes context as an argument since that's what pydantic-core passes when calling it.

Args: self: The BaseModel instance. context: The context.

def get_kind_from_mime_type(mime_type: str) -> Kind | None:
    """Return the Kind whose supported MIME types include ``mime_type``.

    Kinds are examined in the iteration order of SUPPORTED_MIME_TYPES and the
    first match is returned.

    Args:
        mime_type: The MIME type to look up.

    Returns:
        The matching Kind, or None when no kind lists the MIME type.
    """
    matching_kinds = (
        kind
        for kind, supported in SUPPORTED_MIME_TYPES.items()
        if mime_type in supported
    )
    return next(matching_kinds, None)