kiln_ai.datamodel.extraction

  1import logging
  2from enum import Enum
  3from typing import TYPE_CHECKING, List, Literal, Union
  4
  5import anyio
  6from pydantic import (
  7    BaseModel,
  8    Field,
  9    SerializationInfo,
 10    ValidationInfo,
 11    computed_field,
 12    field_serializer,
 13    field_validator,
 14    model_validator,
 15)
 16from typing_extensions import Self, TypedDict
 17
 18from kiln_ai.datamodel.basemodel import (
 19    ID_TYPE,
 20    FilenameString,
 21    KilnAttachmentModel,
 22    KilnParentedModel,
 23    KilnParentModel,
 24)
 25from kiln_ai.datamodel.chunk import ChunkedDocument
 26from kiln_ai.utils.validation import NonEmptyString
 27
 28logger = logging.getLogger(__name__)
 29
 30if TYPE_CHECKING:
 31    from kiln_ai.datamodel.project import Project
 32
 33logger = logging.getLogger(__name__)
 34
 35
# Each member is used as a key of SUPPORTED_MIME_TYPES (defined below), which
# lists the MIME types accepted for that kind. Because the enum mixes in
# ``str``, members compare equal to their plain string values.
class Kind(str, Enum):
    """The kind of content a document contains."""

    DOCUMENT = "document"
    IMAGE = "image"
    VIDEO = "video"
    AUDIO = "audio"
 43
 44
# The member values are MIME type strings. ExtractorConfig uses this enum both
# for ``output_format`` and as the element type of ``passthrough_mimetypes``.
class OutputFormat(str, Enum):
    """The output format for extraction results."""

    TEXT = "text/plain"
    MARKDOWN = "text/markdown"
 50
 51
# Only one extractor type is defined here. The value is stored both on
# ExtractorConfig.extractor_type and inside ExtractorConfig.properties, where
# it serves as the discriminator key.
class ExtractorType(str, Enum):
    """The type of extractor used to process documents."""

    LITELLM = "litellm"
 56
 57
# Allow-list of MIME types, grouped by the Kind of content they represent.
# FileInfo.validate_mime_type rejects any MIME type not found in one of these
# sets, and get_kind_from_mime_type maps a MIME type back to its Kind.
SUPPORTED_MIME_TYPES: dict[Kind, set[str]] = {
    Kind.DOCUMENT: {
        "application/pdf",
        "text/plain",
        "text/markdown",
        "text/html",
        # NOTE(review): "text/md" looks like a non-standard alias for markdown,
        # presumably accepted because some clients report it - confirm.
        "text/md",
    },
    Kind.IMAGE: {
        "image/png",
        "image/jpeg",
    },
    Kind.VIDEO: {
        "video/mp4",
        "video/quicktime",
    },
    Kind.AUDIO: {
        "audio/wav",
        "audio/mpeg",
        "audio/ogg",
    },
}
 80
 81
# Plain pydantic model with two required string fields (neither declares a
# default). It has no validators and no parent/child relationships.
class ExtractionModel(BaseModel):
    """A model available for document extraction."""

    name: str = Field(description="The model identifier.")
    label: str = Field(description="A human-readable name for the model.")
 87
 88
# Stored on Extraction.source.
# NOTE(review): PASSTHROUGH presumably corresponds to files whose MIME type is
# listed in ExtractorConfig.passthrough_mimetypes - confirm against the
# extractor implementation, which is not part of this module.
class ExtractionSource(str, Enum):
    """Whether the document was processed by an extractor or passed through as-is."""

    PROCESSED = "processed"
    PASSTHROUGH = "passthrough"
 94
 95
 96class Extraction(
 97    KilnParentedModel, KilnParentModel, parent_of={"chunked_documents": ChunkedDocument}
 98):
 99    """The result of extracting content from a document."""
100
101    source: ExtractionSource = Field(
102        description="The source of the extraction.",
103    )
104    extractor_config_id: ID_TYPE = Field(
105        description="The ID of the extractor config used to extract the data.",
106    )
107    output: KilnAttachmentModel = Field(
108        description="The extraction output.",
109    )
110
111    def parent_document(self) -> Union["Document", None]:
112        if self.parent is None or self.parent.__class__.__name__ != "Document":
113            return None
114        return self.parent  # type: ignore
115
116    async def output_content(self) -> str | None:
117        if not self.path:
118            raise ValueError(
119                "Failed to resolve the path of extraction output attachment because the extraction does not have a path."
120            )
121
122        full_path = self.output.resolve_path(self.path.parent)
123
124        try:
125            return await anyio.Path(full_path).read_text(encoding="utf-8")
126        except Exception as e:
127            logger.error(
128                f"Failed to read extraction output for {full_path}: {e}", exc_info=True
129            )
130            raise ValueError(f"Failed to read extraction output: {e}")
131
132    def chunked_documents(self, readonly: bool = False) -> list[ChunkedDocument]:
133        return super().chunked_documents(readonly=readonly)  # type: ignore
134
135
# Shape of ExtractorConfig.properties for the litellm extractor.
# ``total=True`` makes every key below required.
class LitellmExtractorConfigProperties(TypedDict, total=True):
    # Discriminator key. ExtractorConfig verifies that it matches the
    # extractor_type field on the config itself.
    extractor_type: Literal[ExtractorType.LITELLM]
    # Prompts must be non-empty strings (NonEmptyString).
    # NOTE(review): presumably one prompt per Kind of input (document, image,
    # video, audio) - confirm against the extractor that consumes them.
    prompt_document: NonEmptyString
    prompt_image: NonEmptyString
    prompt_video: NonEmptyString
    prompt_audio: NonEmptyString
142
143
class ExtractorConfig(KilnParentedModel):
    """Configuration for extracting content from documents using a specific model and prompts."""

    name: FilenameString = Field(
        description="A name to identify the extractor config.",
    )
    is_archived: bool = Field(
        default=False,
        description="Whether the extractor config is archived. Archived extractor configs are not shown in the UI and are not available for use.",
    )
    description: str | None = Field(
        default=None, description="The description of the extractor config"
    )
    model_provider_name: str = Field(
        description="The name of the model provider to use for the extractor config.",
    )
    model_name: str = Field(
        description="The name of the model to use for the extractor config.",
    )
    output_format: OutputFormat = Field(
        default=OutputFormat.MARKDOWN,
        description="The format to use for the output.",
    )
    passthrough_mimetypes: list[OutputFormat] = Field(
        default_factory=list,
        description="If the mimetype is in this list, the extractor will not be used and the text content of the file will be returned as is.",
    )
    extractor_type: ExtractorType = Field(
        description="This is used to determine the type of extractor to use.",
    )
    properties: LitellmExtractorConfigProperties = Field(
        description="Properties to be used to execute the extractor config. This is extractor_type specific and should serialize to a json dict.",
        # the discriminator refers to the properties->extractor_type key (not the extractor_type field on the parent model)
        discriminator="extractor_type",
    )

    @model_validator(mode="before")
    @classmethod
    def upgrade_missing_discriminator_properties(
        cls, data: dict, info: ValidationInfo
    ) -> dict:
        """Copy ``extractor_type`` into ``properties`` for legacy files.

        Runs before field validation and only acts when the validation context
        has ``loading_from_file`` set. Any input that cannot be upgraded is
        returned untouched so that regular field validation reports the
        problem.

        Args:
            data: The raw input being validated (a dict when loaded from file).
            info: Validation info; its ``context`` carries ``loading_from_file``.

        Returns:
            ``data`` itself when no upgrade is needed or possible, otherwise a
            shallow copy whose ``properties`` include ``extractor_type``.
        """
        if not info.context or not info.context.get("loading_from_file", False):
            # Not loading from file, so no need to upgrade
            return data

        if not isinstance(data, dict):
            return data

        # backward compatibility:
        # - we originally did not have the extractor_type in the properties, so we need to add it here
        # - we later wanted extractor_type in the properties to use pydantic's discriminated union feature
        properties = data.get("properties", {})
        if not isinstance(properties, dict):
            # Malformed properties: let field validation produce the error
            # instead of failing here with a TypeError.
            return data
        if "extractor_type" in properties or "extractor_type" not in data:
            # Either already upgraded, or there is nothing to copy from; in the
            # latter case field validation reports the missing field rather
            # than this hook raising a KeyError.
            return data

        # Build new dicts instead of mutating the caller's input in place.
        return {
            **data,
            "properties": {**properties, "extractor_type": data["extractor_type"]},
        }

    @model_validator(mode="after")
    def ensure_extractor_type_matches_properties(self) -> Self:
        """Check that ``extractor_type`` agrees with ``properties``.

        Raises:
            ValueError: If the two ``extractor_type`` values differ.
        """
        # sanity check to ensure the extractor_type matches the properties extractor_type
        if self.extractor_type != self.properties["extractor_type"]:
            raise ValueError(
                f"Extractor type mismatch: {self.extractor_type} != {self.properties['extractor_type']}. This is a bug, please report it."
            )
        return self

    @property
    def litellm_properties(self) -> LitellmExtractorConfigProperties:
        """Return ``properties`` after checking they belong to a litellm extractor.

        Raises:
            ValueError: If ``properties["extractor_type"]`` is not
                ``ExtractorType.LITELLM``.
        """
        if self.properties["extractor_type"] != ExtractorType.LITELLM:
            raise ValueError(
                f"Litellm properties are only available for litellm extractor type. Got {self.properties.get('extractor_type')}"
            )
        return self.properties

    # Workaround to return typed parent without importing Project
    def parent_project(self) -> Union["Project", None]:
        """Return the parent when it is a ``Project``, otherwise ``None``."""
        if self.parent is None or self.parent.__class__.__name__ != "Project":
            return None
        return self.parent  # type: ignore
223
224
class FileInfo(BaseModel):
    """Metadata about an uploaded file."""

    filename: str = Field(description="The filename of the file")

    size: int = Field(description="The size of the file in bytes")

    # Checked against SUPPORTED_MIME_TYPES by validate_mime_type below.
    mime_type: str = Field(description="The MIME type of the file")

    attachment: KilnAttachmentModel = Field(
        description="The attachment to the file",
    )

    @field_serializer("attachment")
    def serialize_attachment(
        self, attachment: KilnAttachmentModel, info: SerializationInfo
    ) -> dict:
        """Serialize the attachment with ``filename_prefix`` set to "attachment".

        The serialization context supplied by the caller is forwarded to the
        attachment's own ``model_dump`` together with the extra
        ``filename_prefix`` entry.
        """
        # Work on a copy: writing into info.context directly would leak
        # "filename_prefix" into the caller's context and into every other
        # field serialized during the same dump.
        context = dict(info.context or {})
        context["filename_prefix"] = "attachment"
        return attachment.model_dump(mode="json", context=context)

    @field_validator("mime_type")
    @classmethod
    def validate_mime_type(cls, mime_type: str, info: ValidationInfo) -> str:
        """Accept ``mime_type`` only if it appears in ``SUPPORTED_MIME_TYPES``.

        Raises:
            ValueError: If the MIME type is not listed under any kind. The
                message includes the filename when it has already been
                validated.
        """
        if any(mime_type in allowed for allowed in SUPPORTED_MIME_TYPES.values()):
            return mime_type

        # filename is declared before mime_type, so it is present in info.data
        # unless its own validation failed; fall back to an empty string then.
        filename = info.data.get("filename") or ""
        raise ValueError(f"MIME type is not supported: {mime_type} (for {filename})")
255
256
class Document(
    KilnParentedModel, KilnParentModel, parent_of={"extractions": Extraction}
):
    """A document uploaded to a project for extraction and RAG."""

    name: FilenameString = Field(
        description="A name to identify the document. Should not be changed after creation.",
    )

    # Unlike ``name``, this field may be edited after the document is created.
    name_override: str | None = Field(
        default=None,
        description="A friendly name to identify the document. This is used for display purposes and can be different from the name.",
    )

    description: str = Field(description="A description for the file")

    original_file: FileInfo = Field(description="The original file")

    kind: Kind = Field(
        description="The kind of document. The kind is a broad family of filetypes that can be handled in a similar way"
    )

    tags: List[str] = Field(
        description="Tags for the document. Tags are used to categorize documents for filtering and reporting.",
        default_factory=list,
    )

    @model_validator(mode="after")
    def validate_tags(self) -> Self:
        """Reject empty tags and tags containing a space.

        Tags are checked in order; the first offending tag decides which
        ``ValueError`` is raised.
        """
        for label in self.tags:
            if not label:
                raise ValueError("Tags cannot be empty strings")
            if " " in label:
                raise ValueError("Tags cannot contain spaces. Try underscores.")

        return self

    # Typed accessor for the parent that avoids importing Project at runtime.
    def parent_project(self) -> Union["Project", None]:
        """Return the parent when it is a ``Project``, otherwise ``None``."""
        owner = self.parent
        if owner is not None and owner.__class__.__name__ == "Project":
            return owner  # type: ignore
        return None

    def extractions(self, readonly: bool = False) -> list[Extraction]:
        """Return the ``Extraction`` children of this document.

        Delegates to the accessor provided by the parent class for the
        "extractions" relationship, forwarding ``readonly`` unchanged.
        """
        children = super().extractions(readonly=readonly)  # type: ignore
        return children

    @computed_field
    @property
    def friendly_name(self) -> str:
        # Older documents have no name_override, so fall back to ``name``
        # whenever the override is missing or empty.
        if self.name_override:
            return self.name_override
        return self.name
309
310
def get_kind_from_mime_type(mime_type: str) -> Kind | None:
    """Return the ``Kind`` whose supported MIME types include ``mime_type``.

    ``SUPPORTED_MIME_TYPES`` is searched in its definition order and the first
    matching kind wins. ``None`` is returned when no kind lists the MIME type.
    """
    matching_kinds = (
        candidate
        for candidate, accepted in SUPPORTED_MIME_TYPES.items()
        if mime_type in accepted
    )
    return next(matching_kinds, None)
logger = <Logger kiln_ai.datamodel.extraction (WARNING)>
class Kind(builtins.str, enum.Enum):
37class Kind(str, Enum):
38    """The kind of content a document contains."""
39
40    DOCUMENT = "document"
41    IMAGE = "image"
42    VIDEO = "video"
43    AUDIO = "audio"

The kind of content a document contains.

DOCUMENT = <Kind.DOCUMENT: 'document'>
IMAGE = <Kind.IMAGE: 'image'>
VIDEO = <Kind.VIDEO: 'video'>
AUDIO = <Kind.AUDIO: 'audio'>
class OutputFormat(builtins.str, enum.Enum):
46class OutputFormat(str, Enum):
47    """The output format for extraction results."""
48
49    TEXT = "text/plain"
50    MARKDOWN = "text/markdown"

The output format for extraction results.

TEXT = <OutputFormat.TEXT: 'text/plain'>
MARKDOWN = <OutputFormat.MARKDOWN: 'text/markdown'>
class ExtractorType(builtins.str, enum.Enum):
53class ExtractorType(str, Enum):
54    """The type of extractor used to process documents."""
55
56    LITELLM = "litellm"

The type of extractor used to process documents.

LITELLM = <ExtractorType.LITELLM: 'litellm'>
SUPPORTED_MIME_TYPES = {<Kind.DOCUMENT: 'document'>: {'text/html', 'text/markdown', 'text/md', 'text/plain', 'application/pdf'}, <Kind.IMAGE: 'image'>: {'image/jpeg', 'image/png'}, <Kind.VIDEO: 'video'>: {'video/mp4', 'video/quicktime'}, <Kind.AUDIO: 'audio'>: {'audio/ogg', 'audio/mpeg', 'audio/wav'}}
class ExtractionModel(pydantic.main.BaseModel):
83class ExtractionModel(BaseModel):
84    """A model available for document extraction."""
85
86    name: str = Field(description="The model identifier.")
87    label: str = Field(description="A human-readable name for the model.")

A model available for document extraction.

name: str
label: str
model_config: ClassVar[pydantic.config.ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

class ExtractionSource(builtins.str, enum.Enum):
90class ExtractionSource(str, Enum):
91    """Whether the document was processed by an extractor or passed through as-is."""
92
93    PROCESSED = "processed"
94    PASSTHROUGH = "passthrough"

Whether the document was processed by an extractor or passed through as-is.

PROCESSED = <ExtractionSource.PROCESSED: 'processed'>
PASSTHROUGH = <ExtractionSource.PASSTHROUGH: 'passthrough'>
class Extraction(kiln_ai.datamodel.basemodel.KilnParentedModel, kiln_ai.datamodel.basemodel.KilnParentModel):
 97class Extraction(
 98    KilnParentedModel, KilnParentModel, parent_of={"chunked_documents": ChunkedDocument}
 99):
100    """The result of extracting content from a document."""
101
102    source: ExtractionSource = Field(
103        description="The source of the extraction.",
104    )
105    extractor_config_id: ID_TYPE = Field(
106        description="The ID of the extractor config used to extract the data.",
107    )
108    output: KilnAttachmentModel = Field(
109        description="The extraction output.",
110    )
111
112    def parent_document(self) -> Union["Document", None]:
113        if self.parent is None or self.parent.__class__.__name__ != "Document":
114            return None
115        return self.parent  # type: ignore
116
117    async def output_content(self) -> str | None:
118        if not self.path:
119            raise ValueError(
120                "Failed to resolve the path of extraction output attachment because the extraction does not have a path."
121            )
122
123        full_path = self.output.resolve_path(self.path.parent)
124
125        try:
126            return await anyio.Path(full_path).read_text(encoding="utf-8")
127        except Exception as e:
128            logger.error(
129                f"Failed to read extraction output for {full_path}: {e}", exc_info=True
130            )
131            raise ValueError(f"Failed to read extraction output: {e}")
132
133    def chunked_documents(self, readonly: bool = False) -> list[ChunkedDocument]:
134        return super().chunked_documents(readonly=readonly)  # type: ignore

The result of extracting content from a document.

extractor_config_id: Optional[str]
output: kiln_ai.datamodel.basemodel.KilnAttachmentModel
def parent_document(self) -> Optional[Document]:
112    def parent_document(self) -> Union["Document", None]:
113        if self.parent is None or self.parent.__class__.__name__ != "Document":
114            return None
115        return self.parent  # type: ignore
async def output_content(self) -> str | None:
117    async def output_content(self) -> str | None:
118        if not self.path:
119            raise ValueError(
120                "Failed to resolve the path of extraction output attachment because the extraction does not have a path."
121            )
122
123        full_path = self.output.resolve_path(self.path.parent)
124
125        try:
126            return await anyio.Path(full_path).read_text(encoding="utf-8")
127        except Exception as e:
128            logger.error(
129                f"Failed to read extraction output for {full_path}: {e}", exc_info=True
130            )
131            raise ValueError(f"Failed to read extraction output: {e}")
def chunked_documents(self, readonly=False) -> List[kiln_ai.datamodel.chunk.ChunkedDocument]:
743        def child_method(self, readonly: bool = False) -> list[child_class]:  # type: ignore[invalid-type-form]
744            return child_class.all_children_of_parent_path(self.path, readonly=readonly)

Returns the ChunkedDocument children found under this extraction's path; pass readonly=True to load them read-only.

def relationship_name() -> str:
761        def relationship_name_method() -> str:
762            return relationship_name

Returns the relationship name registered for this model class.

def parent_type() -> Type[kiln_ai.datamodel.basemodel.KilnParentModel]:
754        def parent_class_method() -> Type[KilnParentModel]:
755            return cls

Returns the parent model class (a KilnParentModel subclass) registered for this model class.

model_config = {'validate_assignment': True}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

def model_post_init(self: pydantic.main.BaseModel, context: Any, /) -> None:
337def init_private_attributes(self: BaseModel, context: Any, /) -> None:
338    """This function is meant to behave like a BaseModel method to initialise private attributes.
339
340    It takes context as an argument since that's what pydantic-core passes when calling it.
341
342    Args:
343        self: The BaseModel instance.
344        context: The context.
345    """
346    if getattr(self, '__pydantic_private__', None) is None:
347        pydantic_private = {}
348        for name, private_attr in self.__private_attributes__.items():
349            default = private_attr.get_default()
350            if default is not PydanticUndefined:
351                pydantic_private[name] = default
352        object_setattr(self, '__pydantic_private__', pydantic_private)

This function is meant to behave like a BaseModel method to initialise private attributes.

It takes context as an argument since that's what pydantic-core passes when calling it.

Args: self: The BaseModel instance. context: The context.

class LitellmExtractorConfigProperties(typing_extensions.TypedDict):
137class LitellmExtractorConfigProperties(TypedDict, total=True):
138    extractor_type: Literal[ExtractorType.LITELLM]
139    prompt_document: NonEmptyString
140    prompt_image: NonEmptyString
141    prompt_video: NonEmptyString
142    prompt_audio: NonEmptyString
extractor_type: Literal[<ExtractorType.LITELLM: 'litellm'>]
prompt_document: Annotated[str, AfterValidator(func=<function string_not_empty at 0x7f90233e93a0>)]
prompt_image: Annotated[str, AfterValidator(func=<function string_not_empty at 0x7f90233e93a0>)]
prompt_video: Annotated[str, AfterValidator(func=<function string_not_empty at 0x7f90233e93a0>)]
prompt_audio: Annotated[str, AfterValidator(func=<function string_not_empty at 0x7f90233e93a0>)]
class ExtractorConfig(kiln_ai.datamodel.basemodel.KilnParentedModel):
145class ExtractorConfig(KilnParentedModel):
146    """Configuration for extracting content from documents using a specific model and prompts."""
147
148    name: FilenameString = Field(
149        description="A name to identify the extractor config.",
150    )
151    is_archived: bool = Field(
152        default=False,
153        description="Whether the extractor config is archived. Archived extractor configs are not shown in the UI and are not available for use.",
154    )
155    description: str | None = Field(
156        default=None, description="The description of the extractor config"
157    )
158    model_provider_name: str = Field(
159        description="The name of the model provider to use for the extractor config.",
160    )
161    model_name: str = Field(
162        description="The name of the model to use for the extractor config.",
163    )
164    output_format: OutputFormat = Field(
165        default=OutputFormat.MARKDOWN,
166        description="The format to use for the output.",
167    )
168    passthrough_mimetypes: list[OutputFormat] = Field(
169        default_factory=list,
170        description="If the mimetype is in this list, the extractor will not be used and the text content of the file will be returned as is.",
171    )
172    extractor_type: ExtractorType = Field(
173        description="This is used to determine the type of extractor to use.",
174    )
175    properties: LitellmExtractorConfigProperties = Field(
176        description="Properties to be used to execute the extractor config. This is extractor_type specific and should serialize to a json dict.",
177        # the discriminator refers to the properties->extractor_type key (not the extractor_type field on the parent model)
178        discriminator="extractor_type",
179    )
180
181    @model_validator(mode="before")
182    def upgrade_missing_discriminator_properties(
183        cls, data: dict, info: ValidationInfo
184    ) -> dict:
185        if not info.context or not info.context.get("loading_from_file", False):
186            # Not loading from file, so no need to upgrade
187            return data
188
189        if not isinstance(data, dict):
190            return data
191
192        # backward compatibility:
193        # - we originally did not have the extractor_type in the properties, so we need to add it here
194        # - we started wanted to have extractor_type in the properties to use pydantic's discriminated union feature
195        properties = data.get("properties", {})
196        if "extractor_type" not in properties:
197            # the extractor_type on the parent model is always there, we just need to add it to the properties
198            properties["extractor_type"] = data["extractor_type"]
199            data["properties"] = properties
200        return data
201
202    @model_validator(mode="after")
203    def ensure_extractor_type_matches_properties(self):
204        # sanity check to ensure the extractor_type matches the properties extractor_type
205        if self.extractor_type != self.properties["extractor_type"]:
206            raise ValueError(
207                f"Extractor type mismatch: {self.extractor_type} != {self.properties['extractor_type']}. This is a bug, please report it."
208            )
209        return self
210
211    @property
212    def litellm_properties(self) -> LitellmExtractorConfigProperties:
213        if self.properties["extractor_type"] != ExtractorType.LITELLM:
214            raise ValueError(
215                f"Litellm properties are only available for litellm extractor type. Got {self.properties.get('extractor_type')}"
216            )
217        return self.properties
218
219    # Workaround to return typed parent without importing Project
220    def parent_project(self) -> Union["Project", None]:
221        if self.parent is None or self.parent.__class__.__name__ != "Project":
222            return None
223        return self.parent  # type: ignore

Configuration for extracting content from documents using a specific model and prompts.

name: Annotated[str, BeforeValidator(func=<function name_validator.<locals>.fn at 0x7f90236f9b20>, json_schema_input_type=PydanticUndefined), StringConstraints(strip_whitespace=None, to_upper=None, to_lower=None, strict=None, min_length=1, max_length=120, pattern=None)]
is_archived: bool
description: str | None
model_provider_name: str
model_name: str
output_format: OutputFormat
passthrough_mimetypes: list[OutputFormat]
extractor_type: ExtractorType
@model_validator(mode='before')
def upgrade_missing_discriminator_properties(cls, data: dict, info: pydantic_core.core_schema.ValidationInfo) -> dict:
181    @model_validator(mode="before")
182    def upgrade_missing_discriminator_properties(
183        cls, data: dict, info: ValidationInfo
184    ) -> dict:
185        if not info.context or not info.context.get("loading_from_file", False):
186            # Not loading from file, so no need to upgrade
187            return data
188
189        if not isinstance(data, dict):
190            return data
191
192        # backward compatibility:
193        # - we originally did not have the extractor_type in the properties, so we need to add it here
194        # - we started wanted to have extractor_type in the properties to use pydantic's discriminated union feature
195        properties = data.get("properties", {})
196        if "extractor_type" not in properties:
197            # the extractor_type on the parent model is always there, we just need to add it to the properties
198            properties["extractor_type"] = data["extractor_type"]
199            data["properties"] = properties
200        return data
@model_validator(mode='after')
def ensure_extractor_type_matches_properties(self):
202    @model_validator(mode="after")
203    def ensure_extractor_type_matches_properties(self):
204        # sanity check to ensure the extractor_type matches the properties extractor_type
205        if self.extractor_type != self.properties["extractor_type"]:
206            raise ValueError(
207                f"Extractor type mismatch: {self.extractor_type} != {self.properties['extractor_type']}. This is a bug, please report it."
208            )
209        return self
litellm_properties: LitellmExtractorConfigProperties
211    @property
212    def litellm_properties(self) -> LitellmExtractorConfigProperties:
213        if self.properties["extractor_type"] != ExtractorType.LITELLM:
214            raise ValueError(
215                f"Litellm properties are only available for litellm extractor type. Got {self.properties.get('extractor_type')}"
216            )
217        return self.properties
def parent_project(self) -> Optional[kiln_ai.datamodel.Project]:
220    def parent_project(self) -> Union["Project", None]:
221        if self.parent is None or self.parent.__class__.__name__ != "Project":
222            return None
223        return self.parent  # type: ignore
def relationship_name() -> str:
761        def relationship_name_method() -> str:
762            return relationship_name

Returns the relationship name registered for this model class.

def parent_type() -> Type[kiln_ai.datamodel.basemodel.KilnParentModel]:
754        def parent_class_method() -> Type[KilnParentModel]:
755            return cls

Returns the parent model class (a KilnParentModel subclass) registered for this model class.

model_config = {'validate_assignment': True}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

def model_post_init(self: pydantic.main.BaseModel, context: Any, /) -> None:
337def init_private_attributes(self: BaseModel, context: Any, /) -> None:
338    """This function is meant to behave like a BaseModel method to initialise private attributes.
339
340    It takes context as an argument since that's what pydantic-core passes when calling it.
341
342    Args:
343        self: The BaseModel instance.
344        context: The context.
345    """
346    if getattr(self, '__pydantic_private__', None) is None:
347        pydantic_private = {}
348        for name, private_attr in self.__private_attributes__.items():
349            default = private_attr.get_default()
350            if default is not PydanticUndefined:
351                pydantic_private[name] = default
352        object_setattr(self, '__pydantic_private__', pydantic_private)

This function is meant to behave like a BaseModel method to initialise private attributes.

It takes context as an argument since that's what pydantic-core passes when calling it.

Args: self: The BaseModel instance. context: The context.

class FileInfo(pydantic.main.BaseModel):
226class FileInfo(BaseModel):
227    """Metadata about an uploaded file."""
228
229    filename: str = Field(description="The filename of the file")
230
231    size: int = Field(description="The size of the file in bytes")
232
233    mime_type: str = Field(description="The MIME type of the file")
234
235    attachment: KilnAttachmentModel = Field(
236        description="The attachment to the file",
237    )
238
239    @field_serializer("attachment")
240    def serialize_attachment(
241        self, attachment: KilnAttachmentModel, info: SerializationInfo
242    ) -> dict:
243        context = info.context or {}
244        context["filename_prefix"] = "attachment"
245        return attachment.model_dump(mode="json", context=context)
246
247    @field_validator("mime_type")
248    @classmethod
249    def validate_mime_type(cls, mime_type: str, info: ValidationInfo) -> str:
250        filename = info.data.get("filename") or ""
251
252        for mime_types in SUPPORTED_MIME_TYPES.values():
253            if mime_type in mime_types:
254                return mime_type
255        raise ValueError(f"MIME type is not supported: {mime_type} (for {filename})")

Metadata about an uploaded file.

filename: str
size: int
mime_type: str
attachment: kiln_ai.datamodel.basemodel.KilnAttachmentModel
@field_serializer('attachment')
def serialize_attachment( self, attachment: kiln_ai.datamodel.basemodel.KilnAttachmentModel, info: pydantic_core.core_schema.SerializationInfo) -> dict:
239    @field_serializer("attachment")
240    def serialize_attachment(
241        self, attachment: KilnAttachmentModel, info: SerializationInfo
242    ) -> dict:
243        context = info.context or {}
244        context["filename_prefix"] = "attachment"
245        return attachment.model_dump(mode="json", context=context)
@field_validator('mime_type')
@classmethod
def validate_mime_type( cls, mime_type: str, info: pydantic_core.core_schema.ValidationInfo) -> str:
247    @field_validator("mime_type")
248    @classmethod
249    def validate_mime_type(cls, mime_type: str, info: ValidationInfo) -> str:
250        filename = info.data.get("filename") or ""
251
252        for mime_types in SUPPORTED_MIME_TYPES.values():
253            if mime_type in mime_types:
254                return mime_type
255        raise ValueError(f"MIME type is not supported: {mime_type} (for {filename})")
model_config: ClassVar[pydantic.config.ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

class Document(kiln_ai.datamodel.basemodel.KilnParentedModel, kiln_ai.datamodel.basemodel.KilnParentModel):
class Document(
    KilnParentedModel, KilnParentModel, parent_of={"extractions": Extraction}
):
    """A document uploaded to a project for extraction and RAG."""

    # Identifying name; per its description it should stay fixed once created.
    name: FilenameString = Field(
        description="A name to identify the document. Should not be changed after creation.",
    )

    # Display name; unlike ``name`` it may be edited after creation.
    name_override: str | None = Field(
        default=None,
        description="A friendly name to identify the document. This is used for display purposes and can be different from the name.",
    )

    description: str = Field(description="A description for the file")

    original_file: FileInfo = Field(description="The original file")

    kind: Kind = Field(
        description="The kind of document. The kind is a broad family of filetypes that can be handled in a similar way"
    )

    tags: List[str] = Field(
        description="Tags for the document. Tags are used to categorize documents for filtering and reporting.",
        default_factory=list,
    )

    @model_validator(mode="after")
    def validate_tags(self) -> Self:
        """Reject empty tags and tags that contain a space.

        Returns:
            The model instance, unchanged, when every tag is acceptable.

        Raises:
            ValueError: On the first tag that is empty or contains a space.
        """
        for label in self.tags:
            if not label:
                message = "Tags cannot be empty strings"
            elif " " in label:
                message = "Tags cannot contain spaces. Try underscores."
            else:
                continue
            raise ValueError(message)

        return self

    def parent_project(self) -> Union["Project", None]:
        """Return the parent when it is a ``Project``, otherwise ``None``.

        The parent is recognised by class name so that ``Project`` does not
        have to be imported at runtime.
        """
        if self.parent is not None and self.parent.__class__.__name__ == "Project":
            return self.parent  # type: ignore
        return None

    def extractions(self, readonly: bool = False) -> list[Extraction]:
        """Typed wrapper around the inherited ``extractions`` accessor."""
        children = super().extractions(readonly=readonly)  # type: ignore
        return children

    # No docstring on purpose: documentation for this computed field is kept
    # in comments so nothing new is attached to the property itself.
    @computed_field
    @property
    def friendly_name(self) -> str:
        # Older documents have no name_override, so fall back to ``name``.
        if self.name_override:
            return self.name_override
        return self.name

A document uploaded to a project for extraction and RAG.

name: Annotated[str, BeforeValidator(func=<function name_validator.<locals>.fn at 0x7f90236f9b20>, json_schema_input_type=PydanticUndefined), StringConstraints(strip_whitespace=None, to_upper=None, to_lower=None, strict=None, min_length=1, max_length=120, pattern=None)]
name_override: str | None
description: str
original_file: FileInfo
kind: Kind
tags: List[str]
@model_validator(mode='after')
def validate_tags(self) -> Self:
286    @model_validator(mode="after")
287    def validate_tags(self) -> Self:
288        for tag in self.tags:
289            if not tag:
290                raise ValueError("Tags cannot be empty strings")
291            if " " in tag:
292                raise ValueError("Tags cannot contain spaces. Try underscores.")
293
294        return self
def parent_project(self) -> Optional[kiln_ai.datamodel.Project]:
297    def parent_project(self) -> Union["Project", None]:
298        if self.parent is None or self.parent.__class__.__name__ != "Project":
299            return None
300        return self.parent  # type: ignore
def extractions(self, readonly=False) -> List[Extraction]:
743        def child_method(self, readonly: bool = False) -> list[child_class]:  # type: ignore[invalid-type-form]
744            return child_class.all_children_of_parent_path(self.path, readonly=readonly)

Return the `Extraction` children found for this document's path. The `readonly` flag is passed through to the child loader.

friendly_name: str
305    @computed_field
306    @property
307    def friendly_name(self) -> str:
308        # backward compatibility: old documents did not have name_override
309        return self.name_override or self.name
def relationship_name() -> str:
761        def relationship_name_method() -> str:
762            return relationship_name

Return the relationship name associated with this model.

def parent_type() -> Type[kiln_ai.datamodel.basemodel.KilnParentModel]:
754        def parent_class_method() -> Type[KilnParentModel]:
755            return cls

Return the parent model class for this model.

model_config = {'validate_assignment': True}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

def model_post_init(self: pydantic.main.BaseModel, context: Any, /) -> None:
def init_private_attributes(self: BaseModel, context: Any, /) -> None:
    """Initialise private attributes, behaving like a BaseModel method.

    ``context`` is part of the signature because pydantic-core passes it when
    calling this function; the body does not read it.

    Args:
        self: The BaseModel instance.
        context: The context.
    """
    # An already-populated private-attribute mapping is left untouched.
    if getattr(self, '__pydantic_private__', None) is not None:
        return

    # Resolve each declared private attribute's default lazily, in order.
    resolved_defaults = (
        (attr_name, private_attr.get_default())
        for attr_name, private_attr in self.__private_attributes__.items()
    )
    # Attributes whose default is undefined are omitted from the mapping.
    pydantic_private = {
        attr_name: default
        for attr_name, default in resolved_defaults
        if default is not PydanticUndefined
    }
    object_setattr(self, '__pydantic_private__', pydantic_private)

This function is meant to behave like a BaseModel method to initialise private attributes.

It takes context as an argument since that's what pydantic-core passes when calling it.

Args: self: The BaseModel instance. context: The context.

def get_kind_from_mime_type(mime_type: str) -> Kind | None:
def get_kind_from_mime_type(mime_type: str) -> Kind | None:
    """Look up which ``Kind`` a MIME type belongs to.

    Args:
        mime_type: The MIME type string to classify.

    Returns:
        The first ``Kind`` in ``SUPPORTED_MIME_TYPES`` whose set contains
        ``mime_type``, or ``None`` when no set contains it.
    """
    matching_kinds = (
        kind
        for kind, mime_types in SUPPORTED_MIME_TYPES.items()
        if mime_type in mime_types
    )
    return next(matching_kinds, None)