kiln_ai.datamodel.extraction

View Source

  1import logging
  2from enum import Enum
  3from typing import TYPE_CHECKING, List, Literal, Union
  4
  5import anyio
  6from pydantic import (
  7    BaseModel,
  8    Field,
  9    SerializationInfo,
 10    ValidationInfo,
 11    computed_field,
 12    field_serializer,
 13    field_validator,
 14    model_validator,
 15)
 16from typing_extensions import Self, TypedDict
 17
 18from kiln_ai.datamodel.basemodel import (
 19    ID_TYPE,
 20    FilenameString,
 21    KilnAttachmentModel,
 22    KilnParentedModel,
 23    KilnParentModel,
 24)
 25from kiln_ai.datamodel.chunk import ChunkedDocument
 26from kiln_ai.utils.validation import NonEmptyString
 27
 28logger = logging.getLogger(__name__)
 29
 30if TYPE_CHECKING:
 31    from kiln_ai.datamodel.project import Project
 32
 33logger = logging.getLogger(__name__)
 34
 35
 36class Kind(str, Enum):
 37    DOCUMENT = "document"
 38    IMAGE = "image"
 39    VIDEO = "video"
 40    AUDIO = "audio"
 41
 42
 43class OutputFormat(str, Enum):
 44    TEXT = "text/plain"
 45    MARKDOWN = "text/markdown"
 46
 47
 48class ExtractorType(str, Enum):
 49    LITELLM = "litellm"
 50
 51
 52SUPPORTED_MIME_TYPES = {
 53    Kind.DOCUMENT: {
 54        "application/pdf",
 55        "text/plain",
 56        "text/markdown",
 57        "text/html",
 58        "text/md",
 59    },
 60    Kind.IMAGE: {
 61        "image/png",
 62        "image/jpeg",
 63    },
 64    Kind.VIDEO: {
 65        "video/mp4",
 66        "video/quicktime",
 67    },
 68    Kind.AUDIO: {
 69        "audio/wav",
 70        "audio/mpeg",
 71        "audio/ogg",
 72    },
 73}
 74
 75
 76class ExtractionModel(BaseModel):
 77    name: str
 78    label: str
 79
 80
 81class ExtractionSource(str, Enum):
 82    PROCESSED = "processed"
 83    PASSTHROUGH = "passthrough"
 84
 85
 86class Extraction(
 87    KilnParentedModel, KilnParentModel, parent_of={"chunked_documents": ChunkedDocument}
 88):
 89    source: ExtractionSource = Field(
 90        description="The source of the extraction.",
 91    )
 92    extractor_config_id: ID_TYPE = Field(
 93        description="The ID of the extractor config used to extract the data.",
 94    )
 95    output: KilnAttachmentModel = Field(
 96        description="The extraction output.",
 97    )
 98
 99    def parent_document(self) -> Union["Document", None]:
100        if self.parent is None or self.parent.__class__.__name__ != "Document":
101            return None
102        return self.parent  # type: ignore
103
104    async def output_content(self) -> str | None:
105        if not self.path:
106            raise ValueError(
107                "Failed to resolve the path of extraction output attachment because the extraction does not have a path."
108            )
109
110        full_path = self.output.resolve_path(self.path.parent)
111
112        try:
113            return await anyio.Path(full_path).read_text(encoding="utf-8")
114        except Exception as e:
115            logger.error(
116                f"Failed to read extraction output for {full_path}: {e}", exc_info=True
117            )
118            raise ValueError(f"Failed to read extraction output: {e}")
119
120    def chunked_documents(self, readonly: bool = False) -> list[ChunkedDocument]:
121        return super().chunked_documents(readonly=readonly)  # type: ignore
122
123
124class LitellmExtractorConfigProperties(TypedDict, total=True):
125    extractor_type: Literal[ExtractorType.LITELLM]
126    prompt_document: NonEmptyString
127    prompt_image: NonEmptyString
128    prompt_video: NonEmptyString
129    prompt_audio: NonEmptyString
130
131
132class ExtractorConfig(KilnParentedModel):
133    name: FilenameString = Field(
134        description="A name to identify the extractor config.",
135    )
136    is_archived: bool = Field(
137        default=False,
138        description="Whether the extractor config is archived. Archived extractor configs are not shown in the UI and are not available for use.",
139    )
140    description: str | None = Field(
141        default=None, description="The description of the extractor config"
142    )
143    model_provider_name: str = Field(
144        description="The name of the model provider to use for the extractor config.",
145    )
146    model_name: str = Field(
147        description="The name of the model to use for the extractor config.",
148    )
149    output_format: OutputFormat = Field(
150        default=OutputFormat.MARKDOWN,
151        description="The format to use for the output.",
152    )
153    passthrough_mimetypes: list[OutputFormat] = Field(
154        default_factory=list,
155        description="If the mimetype is in this list, the extractor will not be used and the text content of the file will be returned as is.",
156    )
157    extractor_type: ExtractorType = Field(
158        description="This is used to determine the type of extractor to use.",
159    )
160    properties: LitellmExtractorConfigProperties = Field(
161        description="Properties to be used to execute the extractor config. This is extractor_type specific and should serialize to a json dict.",
162        # the discriminator refers to the properties->extractor_type key (not the extractor_type field on the parent model)
163        discriminator="extractor_type",
164    )
165
166    @model_validator(mode="before")
167    def upgrade_missing_discriminator_properties(
168        cls, data: dict, info: ValidationInfo
169    ) -> dict:
170        if not info.context or not info.context.get("loading_from_file", False):
171            # Not loading from file, so no need to upgrade
172            return data
173
174        if not isinstance(data, dict):
175            return data
176
177        # backward compatibility:
178        # - we originally did not have the extractor_type in the properties, so we need to add it here
179        # - we later wanted extractor_type in the properties so we can use pydantic's discriminated union feature
180        properties = data.get("properties", {})
181        if "extractor_type" not in properties:
182            # the extractor_type on the parent model is always there, we just need to add it to the properties
183            properties["extractor_type"] = data["extractor_type"]
184            data["properties"] = properties
185        return data
186
187    @model_validator(mode="after")
188    def ensure_extractor_type_matches_properties(self):
189        # sanity check to ensure the extractor_type matches the properties extractor_type
190        if self.extractor_type != self.properties["extractor_type"]:
191            raise ValueError(
192                f"Extractor type mismatch: {self.extractor_type} != {self.properties['extractor_type']}. This is a bug, please report it."
193            )
194        return self
195
196    @property
197    def litellm_properties(self) -> LitellmExtractorConfigProperties:
198        if self.properties["extractor_type"] != ExtractorType.LITELLM:
199            raise ValueError(
200                f"Litellm properties are only available for litellm extractor type. Got {self.properties.get('extractor_type')}"
201            )
202        return self.properties
203
204    # Workaround to return typed parent without importing Project
205    def parent_project(self) -> Union["Project", None]:
206        if self.parent is None or self.parent.__class__.__name__ != "Project":
207            return None
208        return self.parent  # type: ignore
209
210
211class FileInfo(BaseModel):
212    filename: str = Field(description="The filename of the file")
213
214    size: int = Field(description="The size of the file in bytes")
215
216    mime_type: str = Field(description="The MIME type of the file")
217
218    attachment: KilnAttachmentModel = Field(
219        description="The attachment to the file",
220    )
221
222    @field_serializer("attachment")
223    def serialize_attachment(
224        self, attachment: KilnAttachmentModel, info: SerializationInfo
225    ) -> dict:
226        context = info.context or {}
227        context["filename_prefix"] = "attachment"
228        return attachment.model_dump(mode="json", context=context)
229
230    @field_validator("mime_type")
231    @classmethod
232    def validate_mime_type(cls, mime_type: str, info: ValidationInfo) -> str:
233        filename = info.data.get("filename") or ""
234
235        for mime_types in SUPPORTED_MIME_TYPES.values():
236            if mime_type in mime_types:
237                return mime_type
238        raise ValueError(f"MIME type is not supported: {mime_type} (for {filename})")
239
240
241class Document(
242    KilnParentedModel, KilnParentModel, parent_of={"extractions": Extraction}
243):
244    # this field should not be changed after creation
245    name: FilenameString = Field(
246        description="A name to identify the document.",
247    )
248
249    # this field can be changed after creation
250    name_override: str | None = Field(
251        description="A friendly name to identify the document. This is used for display purposes and can be different from the name.",
252        default=None,
253    )
254
255    description: str = Field(description="A description for the file")
256
257    original_file: FileInfo = Field(description="The original file")
258
259    kind: Kind = Field(
260        description="The kind of document. The kind is a broad family of filetypes that can be handled in a similar way"
261    )
262
263    tags: List[str] = Field(
264        default_factory=list,
265        description="Tags for the document. Tags are used to categorize documents for filtering and reporting.",
266    )
267
268    @model_validator(mode="after")
269    def validate_tags(self) -> Self:
270        for tag in self.tags:
271            if not tag:
272                raise ValueError("Tags cannot be empty strings")
273            if " " in tag:
274                raise ValueError("Tags cannot contain spaces. Try underscores.")
275
276        return self
277
278    # Workaround to return typed parent without importing Project
279    def parent_project(self) -> Union["Project", None]:
280        if self.parent is None or self.parent.__class__.__name__ != "Project":
281            return None
282        return self.parent  # type: ignore
283
284    def extractions(self, readonly: bool = False) -> list[Extraction]:
285        return super().extractions(readonly=readonly)  # type: ignore
286
287    @computed_field
288    @property
289    def friendly_name(self) -> str:
290        # backward compatibility: old documents did not have name_override
291        return self.name_override or self.name
292
293
294def get_kind_from_mime_type(mime_type: str) -> Kind | None:
295    for kind, mime_types in SUPPORTED_MIME_TYPES.items():
296        if mime_type in mime_types:
297            return kind
298    return None

logger = <Logger kiln_ai.datamodel.extraction (WARNING)>

class Kind(builtins.str, enum.Enum): View Source

37class Kind(str, Enum):
38    DOCUMENT = "document"
39    IMAGE = "image"
40    VIDEO = "video"
41    AUDIO = "audio"

str(object='') -> str
str(bytes_or_buffer[, encoding[, errors]]) -> str

Create a new string object from the given object. If encoding or errors is specified, then the object must expose a data buffer that will be decoded using the given encoding and error handler. Otherwise, returns the result of object.__str__() (if defined) or repr(object). encoding defaults to 'utf-8'. errors defaults to 'strict'.

DOCUMENT = <Kind.DOCUMENT: 'document'>

IMAGE = <Kind.IMAGE: 'image'>

VIDEO = <Kind.VIDEO: 'video'>

AUDIO = <Kind.AUDIO: 'audio'>

class OutputFormat(builtins.str, enum.Enum): View Source

44class OutputFormat(str, Enum):
45    TEXT = "text/plain"
46    MARKDOWN = "text/markdown"

str(object='') -> str
str(bytes_or_buffer[, encoding[, errors]]) -> str

TEXT = <OutputFormat.TEXT: 'text/plain'>

MARKDOWN = <OutputFormat.MARKDOWN: 'text/markdown'>

class ExtractorType(builtins.str, enum.Enum): View Source

49class ExtractorType(str, Enum):
50    LITELLM = "litellm"

str(object='') -> str
str(bytes_or_buffer[, encoding[, errors]]) -> str

LITELLM = <ExtractorType.LITELLM: 'litellm'>

SUPPORTED_MIME_TYPES = {<Kind.DOCUMENT: 'document'>: {'text/md', 'text/markdown', 'text/plain', 'application/pdf', 'text/html'}, <Kind.IMAGE: 'image'>: {'image/png', 'image/jpeg'}, <Kind.VIDEO: 'video'>: {'video/quicktime', 'video/mp4'}, <Kind.AUDIO: 'audio'>: {'audio/ogg', 'audio/wav', 'audio/mpeg'}}

class ExtractionModel(pydantic.main.BaseModel): View Source

77class ExtractionModel(BaseModel):
78    name: str
79    label: str

Usage Documentation: Models

A base class for creating Pydantic models.

Attributes:
__class_vars__: The names of the class variables defined on the model.
__private_attributes__: Metadata about the private attributes of the model.
__signature__: The synthesized `__init__` [Signature][inspect.Signature] of the model.

__pydantic_complete__: Whether model building is completed, or if there are still undefined fields.
__pydantic_core_schema__: The core schema of the model.
__pydantic_custom_init__: Whether the model has a custom `__init__` function.
__pydantic_decorators__: Metadata containing the decorators defined on the model.
    This replaces `Model.__validators__` and `Model.__root_validators__` from Pydantic V1.
__pydantic_generic_metadata__: Metadata for generic models; contains data used for a similar purpose to
    __args__, __origin__, __parameters__ in typing-module generics. May eventually be replaced by these.
__pydantic_parent_namespace__: Parent namespace of the model, used for automatic rebuilding of models.
__pydantic_post_init__: The name of the post-init method for the model, if defined.
__pydantic_root_model__: Whether the model is a [`RootModel`][pydantic.root_model.RootModel].
__pydantic_serializer__: The `pydantic-core` `SchemaSerializer` used to dump instances of the model.
__pydantic_validator__: The `pydantic-core` `SchemaValidator` used to validate instances of the model.

__pydantic_fields__: A dictionary of field names and their corresponding [`FieldInfo`][pydantic.fields.FieldInfo] objects.
__pydantic_computed_fields__: A dictionary of computed field names and their corresponding [`ComputedFieldInfo`][pydantic.fields.ComputedFieldInfo] objects.

__pydantic_extra__: A dictionary containing extra values, if [`extra`][pydantic.config.ConfigDict.extra]
    is set to `'allow'`.
__pydantic_fields_set__: The names of fields explicitly set during instantiation.
__pydantic_private__: Values of private attributes set on the model instance.

name: str

label: str

model_config: ClassVar[pydantic.config.ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

class ExtractionSource(builtins.str, enum.Enum): View Source

82class ExtractionSource(str, Enum):
83    PROCESSED = "processed"
84    PASSTHROUGH = "passthrough"

str(object='') -> str
str(bytes_or_buffer[, encoding[, errors]]) -> str

PROCESSED = <ExtractionSource.PROCESSED: 'processed'>

PASSTHROUGH = <ExtractionSource.PASSTHROUGH: 'passthrough'>

class Extraction(kiln_ai.datamodel.basemodel.KilnParentedModel, kiln_ai.datamodel.basemodel.KilnParentModel): View Source

 87class Extraction(
 88    KilnParentedModel, KilnParentModel, parent_of={"chunked_documents": ChunkedDocument}
 89):
 90    source: ExtractionSource = Field(
 91        description="The source of the extraction.",
 92    )
 93    extractor_config_id: ID_TYPE = Field(
 94        description="The ID of the extractor config used to extract the data.",
 95    )
 96    output: KilnAttachmentModel = Field(
 97        description="The extraction output.",
 98    )
 99
100    def parent_document(self) -> Union["Document", None]:
101        if self.parent is None or self.parent.__class__.__name__ != "Document":
102            return None
103        return self.parent  # type: ignore
104
105    async def output_content(self) -> str | None:
106        if not self.path:
107            raise ValueError(
108                "Failed to resolve the path of extraction output attachment because the extraction does not have a path."
109            )
110
111        full_path = self.output.resolve_path(self.path.parent)
112
113        try:
114            return await anyio.Path(full_path).read_text(encoding="utf-8")
115        except Exception as e:
116            logger.error(
117                f"Failed to read extraction output for {full_path}: {e}", exc_info=True
118            )
119            raise ValueError(f"Failed to read extraction output: {e}")
120
121    def chunked_documents(self, readonly: bool = False) -> list[ChunkedDocument]:
122        return super().chunked_documents(readonly=readonly)  # type: ignore

Base model for Kiln models that have a parent-child relationship. This base class is for child models.

This class provides functionality for managing hierarchical relationships between models, including parent reference handling and file system organization.

Attributes:
parent (KilnBaseModel): Reference to the parent model instance. Not persisted, just in memory.

source: ExtractionSource

extractor_config_id: Optional[str]

output: kiln_ai.datamodel.basemodel.KilnAttachmentModel

def parent_document(self) -> Optional[Document]: View Source

100    def parent_document(self) -> Union["Document", None]:
101        if self.parent is None or self.parent.__class__.__name__ != "Document":
102            return None
103        return self.parent  # type: ignore

async def output_content(self) -> str | None: View Source

105    async def output_content(self) -> str | None:
106        if not self.path:
107            raise ValueError(
108                "Failed to resolve the path of extraction output attachment because the extraction does not have a path."
109            )
110
111        full_path = self.output.resolve_path(self.path.parent)
112
113        try:
114            return await anyio.Path(full_path).read_text(encoding="utf-8")
115        except Exception as e:
116            logger.error(
117                f"Failed to read extraction output for {full_path}: {e}", exc_info=True
118            )
119            raise ValueError(f"Failed to read extraction output: {e}")

def chunked_documents(self, readonly=False) -> List[kiln_ai.datamodel.chunk.ChunkedDocument]: View Source

695        def child_method(self, readonly: bool = False) -> list[child_class]:
696            return child_class.all_children_of_parent_path(self.path, readonly=readonly)

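Return the ChunkedDocument children of this extraction. The generated method calls `ChunkedDocument.all_children_of_parent_path(self.path, readonly=readonly)`.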

def relationship_name() -> str: View Source

713        def relationship_name_method() -> str:
714            return relationship_name

Return the relationship name bound to this class. The generated method simply returns the captured `relationship_name` string.

def parent_type() -> Type[kiln_ai.datamodel.basemodel.KilnParentModel]: View Source

706        def parent_class_method() -> Type[KilnParentModel]:
707            return cls

Return the parent model class for this class. The generated method returns the captured `cls`, typed as `Type[KilnParentModel]`.

model_config = {'validate_assignment': True}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

def model_post_init(self: pydantic.main.BaseModel, context: Any, /) -> None: View Source

337def init_private_attributes(self: BaseModel, context: Any, /) -> None:
338    """This function is meant to behave like a BaseModel method to initialise private attributes.
339
340    It takes context as an argument since that's what pydantic-core passes when calling it.
341
342    Args:
343        self: The BaseModel instance.
344        context: The context.
345    """
346    if getattr(self, '__pydantic_private__', None) is None:
347        pydantic_private = {}
348        for name, private_attr in self.__private_attributes__.items():
349            default = private_attr.get_default()
350            if default is not PydanticUndefined:
351                pydantic_private[name] = default
352        object_setattr(self, '__pydantic_private__', pydantic_private)

This function is meant to behave like a BaseModel method to initialise private attributes.

It takes context as an argument since that's what pydantic-core passes when calling it.

Args:
self: The BaseModel instance.
context: The context.

class LitellmExtractorConfigProperties(typing_extensions.TypedDict): View Source

125class LitellmExtractorConfigProperties(TypedDict, total=True):
126    extractor_type: Literal[ExtractorType.LITELLM]
127    prompt_document: NonEmptyString
128    prompt_image: NonEmptyString
129    prompt_video: NonEmptyString
130    prompt_audio: NonEmptyString

extractor_type: Literal[<ExtractorType.LITELLM: 'litellm'>]

prompt_document: Annotated[str, AfterValidator(func=<function string_not_empty at 0x7f2f1dc82fc0>)]

prompt_image: Annotated[str, AfterValidator(func=<function string_not_empty at 0x7f2f1dc82fc0>)]

prompt_video: Annotated[str, AfterValidator(func=<function string_not_empty at 0x7f2f1dc82fc0>)]

prompt_audio: Annotated[str, AfterValidator(func=<function string_not_empty at 0x7f2f1dc82fc0>)]

class ExtractorConfig(kiln_ai.datamodel.basemodel.KilnParentedModel): View Source

133class ExtractorConfig(KilnParentedModel):
134    name: FilenameString = Field(
135        description="A name to identify the extractor config.",
136    )
137    is_archived: bool = Field(
138        default=False,
139        description="Whether the extractor config is archived. Archived extractor configs are not shown in the UI and are not available for use.",
140    )
141    description: str | None = Field(
142        default=None, description="The description of the extractor config"
143    )
144    model_provider_name: str = Field(
145        description="The name of the model provider to use for the extractor config.",
146    )
147    model_name: str = Field(
148        description="The name of the model to use for the extractor config.",
149    )
150    output_format: OutputFormat = Field(
151        default=OutputFormat.MARKDOWN,
152        description="The format to use for the output.",
153    )
154    passthrough_mimetypes: list[OutputFormat] = Field(
155        default_factory=list,
156        description="If the mimetype is in this list, the extractor will not be used and the text content of the file will be returned as is.",
157    )
158    extractor_type: ExtractorType = Field(
159        description="This is used to determine the type of extractor to use.",
160    )
161    properties: LitellmExtractorConfigProperties = Field(
162        description="Properties to be used to execute the extractor config. This is extractor_type specific and should serialize to a json dict.",
163        # the discriminator refers to the properties->extractor_type key (not the extractor_type field on the parent model)
164        discriminator="extractor_type",
165    )
166
167    @model_validator(mode="before")
168    def upgrade_missing_discriminator_properties(
169        cls, data: dict, info: ValidationInfo
170    ) -> dict:
171        if not info.context or not info.context.get("loading_from_file", False):
172            # Not loading from file, so no need to upgrade
173            return data
174
175        if not isinstance(data, dict):
176            return data
177
178        # backward compatibility:
179        # - we originally did not have the extractor_type in the properties, so we need to add it here
180        # - we later wanted extractor_type in the properties so we can use pydantic's discriminated union feature
181        properties = data.get("properties", {})
182        if "extractor_type" not in properties:
183            # the extractor_type on the parent model is always there, we just need to add it to the properties
184            properties["extractor_type"] = data["extractor_type"]
185            data["properties"] = properties
186        return data
187
188    @model_validator(mode="after")
189    def ensure_extractor_type_matches_properties(self):
190        # sanity check to ensure the extractor_type matches the properties extractor_type
191        if self.extractor_type != self.properties["extractor_type"]:
192            raise ValueError(
193                f"Extractor type mismatch: {self.extractor_type} != {self.properties['extractor_type']}. This is a bug, please report it."
194            )
195        return self
196
197    @property
198    def litellm_properties(self) -> LitellmExtractorConfigProperties:
199        if self.properties["extractor_type"] != ExtractorType.LITELLM:
200            raise ValueError(
201                f"Litellm properties are only available for litellm extractor type. Got {self.properties.get('extractor_type')}"
202            )
203        return self.properties
204
205    # Workaround to return typed parent without importing Project
206    def parent_project(self) -> Union["Project", None]:
207        if self.parent is None or self.parent.__class__.__name__ != "Project":
208            return None
209        return self.parent  # type: ignore

Base model for Kiln models that have a parent-child relationship. This base class is for child models.

This class provides functionality for managing hierarchical relationships between models, including parent reference handling and file system organization.

Attributes:
parent (KilnBaseModel): Reference to the parent model instance. Not persisted, just in memory.

name: Annotated[str, BeforeValidator(func=<function name_validator.<locals>.fn at 0x7f2f1ec0c9a0>, json_schema_input_type=PydanticUndefined)]

is_archived: bool

description: str | None

model_provider_name: str

model_name: str

output_format: OutputFormat

passthrough_mimetypes: list[OutputFormat]

extractor_type: ExtractorType

properties: LitellmExtractorConfigProperties

@model_validator(mode='before')

def upgrade_missing_discriminator_properties(cls, data: dict, info: pydantic_core.core_schema.ValidationInfo) -> dict: View Source

167    @model_validator(mode="before")
168    def upgrade_missing_discriminator_properties(
169        cls, data: dict, info: ValidationInfo
170    ) -> dict:
171        if not info.context or not info.context.get("loading_from_file", False):
172            # Not loading from file, so no need to upgrade
173            return data
174
175        if not isinstance(data, dict):
176            return data
177
178        # backward compatibility:
179        # - we originally did not have the extractor_type in the properties, so we need to add it here
180        # - we later wanted extractor_type in the properties so we can use pydantic's discriminated union feature
181        properties = data.get("properties", {})
182        if "extractor_type" not in properties:
183            # the extractor_type on the parent model is always there, we just need to add it to the properties
184            properties["extractor_type"] = data["extractor_type"]
185            data["properties"] = properties
186        return data

@model_validator(mode='after')

def ensure_extractor_type_matches_properties(self): View Source

188    @model_validator(mode="after")
189    def ensure_extractor_type_matches_properties(self):
190        # sanity check to ensure the extractor_type matches the properties extractor_type
191        if self.extractor_type != self.properties["extractor_type"]:
192            raise ValueError(
193                f"Extractor type mismatch: {self.extractor_type} != {self.properties['extractor_type']}. This is a bug, please report it."
194            )
195        return self

litellm_properties: LitellmExtractorConfigProperties View Source

197    @property
198    def litellm_properties(self) -> LitellmExtractorConfigProperties:
199        if self.properties["extractor_type"] != ExtractorType.LITELLM:
200            raise ValueError(
201                f"Litellm properties are only available for litellm extractor type. Got {self.properties.get('extractor_type')}"
202            )
203        return self.properties

def parent_project(self) -> Optional[kiln_ai.datamodel.Project]: View Source

206    def parent_project(self) -> Union["Project", None]:
207        if self.parent is None or self.parent.__class__.__name__ != "Project":
208            return None
209        return self.parent  # type: ignore

def relationship_name() -> str: View Source

713        def relationship_name_method() -> str:
714            return relationship_name

Return the relationship name bound to this class. The generated method simply returns the captured `relationship_name` string.

def parent_type() -> Type[kiln_ai.datamodel.basemodel.KilnParentModel]: View Source

706        def parent_class_method() -> Type[KilnParentModel]:
707            return cls

Return the parent model class for this class. The generated method returns the captured `cls`, typed as `Type[KilnParentModel]`.

model_config = {'validate_assignment': True}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

def model_post_init(self: pydantic.main.BaseModel, context: Any, /) -> None: View Source

337def init_private_attributes(self: BaseModel, context: Any, /) -> None:
338    """This function is meant to behave like a BaseModel method to initialise private attributes.
339
340    It takes context as an argument since that's what pydantic-core passes when calling it.
341
342    Args:
343        self: The BaseModel instance.
344        context: The context.
345    """
346    if getattr(self, '__pydantic_private__', None) is None:
347        pydantic_private = {}
348        for name, private_attr in self.__private_attributes__.items():
349            default = private_attr.get_default()
350            if default is not PydanticUndefined:
351                pydantic_private[name] = default
352        object_setattr(self, '__pydantic_private__', pydantic_private)

This function is meant to behave like a BaseModel method to initialise private attributes.

It takes context as an argument since that's what pydantic-core passes when calling it.

Args:
self: The BaseModel instance.
context: The context.

class FileInfo(pydantic.main.BaseModel): View Source

212class FileInfo(BaseModel):
213    filename: str = Field(description="The filename of the file")
214
215    size: int = Field(description="The size of the file in bytes")
216
217    mime_type: str = Field(description="The MIME type of the file")
218
219    attachment: KilnAttachmentModel = Field(
220        description="The attachment to the file",
221    )
222
223    @field_serializer("attachment")
224    def serialize_attachment(
225        self, attachment: KilnAttachmentModel, info: SerializationInfo
226    ) -> dict:
227        context = info.context or {}
228        context["filename_prefix"] = "attachment"
229        return attachment.model_dump(mode="json", context=context)
230
231    @field_validator("mime_type")
232    @classmethod
233    def validate_mime_type(cls, mime_type: str, info: ValidationInfo) -> str:
234        filename = info.data.get("filename") or ""
235
236        for mime_types in SUPPORTED_MIME_TYPES.values():
237            if mime_type in mime_types:
238                return mime_type
239        raise ValueError(f"MIME type is not supported: {mime_type} (for {filename})")

Usage Documentation: Models

A base class for creating Pydantic models.

__pydantic_complete__: Whether model building is completed, or if there are still undefined fields.
__pydantic_core_schema__: The core schema of the model.
__pydantic_custom_init__: Whether the model has a custom `__init__` function.
__pydantic_decorators__: Metadata containing the decorators defined on the model.
    This replaces `Model.__validators__` and `Model.__root_validators__` from Pydantic V1.
__pydantic_generic_metadata__: Metadata for generic models; contains data used for a similar purpose to
    __args__, __origin__, __parameters__ in typing-module generics. May eventually be replaced by these.
__pydantic_parent_namespace__: Parent namespace of the model, used for automatic rebuilding of models.
__pydantic_post_init__: The name of the post-init method for the model, if defined.
__pydantic_root_model__: Whether the model is a [`RootModel`][pydantic.root_model.RootModel].
__pydantic_serializer__: The `pydantic-core` `SchemaSerializer` used to dump instances of the model.
__pydantic_validator__: The `pydantic-core` `SchemaValidator` used to validate instances of the model.

__pydantic_fields__: A dictionary of field names and their corresponding [`FieldInfo`][pydantic.fields.FieldInfo] objects.
__pydantic_computed_fields__: A dictionary of computed field names and their corresponding [`ComputedFieldInfo`][pydantic.fields.ComputedFieldInfo] objects.

__pydantic_extra__: A dictionary containing extra values, if [`extra`][pydantic.config.ConfigDict.extra]
    is set to `'allow'`.
__pydantic_fields_set__: The names of fields explicitly set during instantiation.
__pydantic_private__: Values of private attributes set on the model instance.

filename: str

size: int

mime_type: str

attachment: kiln_ai.datamodel.basemodel.KilnAttachmentModel

@field_serializer('attachment')

def serialize_attachment( self, attachment: kiln_ai.datamodel.basemodel.KilnAttachmentModel, info: pydantic_core.core_schema.SerializationInfo) -> dict: View Source

223    @field_serializer("attachment")
224    def serialize_attachment(
225        self, attachment: KilnAttachmentModel, info: SerializationInfo
226    ) -> dict:
227        context = info.context or {}
228        context["filename_prefix"] = "attachment"
229        return attachment.model_dump(mode="json", context=context)

@field_validator('mime_type')

@classmethod

def validate_mime_type( cls, mime_type: str, info: pydantic_core.core_schema.ValidationInfo) -> str: View Source

231    @field_validator("mime_type")
232    @classmethod
233    def validate_mime_type(cls, mime_type: str, info: ValidationInfo) -> str:
234        filename = info.data.get("filename") or ""
235
236        for mime_types in SUPPORTED_MIME_TYPES.values():
237            if mime_type in mime_types:
238                return mime_type
239        raise ValueError(f"MIME type is not supported: {mime_type} (for {filename})")

model_config: ClassVar[pydantic.config.ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

class Document(kiln_ai.datamodel.basemodel.KilnParentedModel, kiln_ai.datamodel.basemodel.KilnParentModel): View Source

# A named document record: the original file, its kind, and optional tags.
# It is the parent of the Extraction records registered as "extractions".
# (Written as comments, not a class docstring: Pydantic copies a model's
# docstring into its JSON schema description.)
class Document(
    KilnParentedModel, KilnParentModel, parent_of={"extractions": Extraction}
):
    # Identifier for the document; must stay fixed once the document exists.
    name: FilenameString = Field(
        description="A name to identify the document.",
    )

    # Display name; unlike `name`, this one may be edited after creation.
    name_override: str | None = Field(
        default=None,
        description="A friendly name to identify the document. This is used for display purposes and can be different from the name.",
    )

    description: str = Field(description="A description for the file")

    original_file: FileInfo = Field(description="The original file")

    kind: Kind = Field(
        description="The kind of document. The kind is a broad family of filetypes that can be handled in a similar way"
    )

    tags: List[str] = Field(
        description="Tags for the document. Tags are used to categorize documents for filtering and reporting.",
        default_factory=list,
    )

    @model_validator(mode="after")
    def validate_tags(self) -> Self:
        """Reject empty tags and tags that contain a space.

        Raises:
            ValueError: For the first offending tag, in list order.
        """
        # Lazily pair each offending tag with its message so that only the
        # first problem (in list order) is reported.
        violations = (
            "Tags cannot contain spaces. Try underscores."
            if tag
            else "Tags cannot be empty strings"
            for tag in self.tags
            if not tag or " " in tag
        )
        first_violation = next(violations, None)
        if first_violation is not None:
            raise ValueError(first_violation)
        return self

    # Workaround to return typed parent without importing Project
    def parent_project(self) -> Union["Project", None]:
        """Return the parent when its class is named "Project", else None."""
        # The class is matched by name so Project need not be imported here.
        if self.parent is not None and self.parent.__class__.__name__ == "Project":
            return self.parent  # type: ignore
        return None

    def extractions(self, readonly: bool = False) -> list[Extraction]:
        """Return this document's extractions via the inherited accessor.

        Args:
            readonly: Passed through unchanged to the parent implementation.
        """
        return super().extractions(readonly=readonly)  # type: ignore

    # Explained with comments rather than a docstring: a computed field's
    # docstring is used as its schema description.
    @computed_field
    @property
    def friendly_name(self) -> str:
        # Backward compatibility: old documents did not have name_override,
        # so fall back to the plain name when no override is set.
        override = self.name_override
        return override if override else self.name

Base model for Kiln models that have a parent-child relationship. This base class is for child models.

This class provides functionality for managing hierarchical relationships between models, including parent reference handling and file system organization.

Attributes: parent (KilnBaseModel): Reference to the parent model instance. Not persisted, just in memory.

name: Annotated[str, BeforeValidator(func=<function name_validator.<locals>.fn>, json_schema_input_type=PydanticUndefined)]

name_override: str | None

description: str

original_file: FileInfo

kind: Kind

tags: List[str]

@model_validator(mode='after')

def validate_tags(self) -> Self: View Source

269    @model_validator(mode="after")
270    def validate_tags(self) -> Self:
271        for tag in self.tags:
272            if not tag:
273                raise ValueError("Tags cannot be empty strings")
274            if " " in tag:
275                raise ValueError("Tags cannot contain spaces. Try underscores.")
276
277        return self

def parent_project(self) -> Optional[kiln_ai.datamodel.Project]: View Source

280    def parent_project(self) -> Union["Project", None]:
281        if self.parent is None or self.parent.__class__.__name__ != "Project":
282            return None
283        return self.parent  # type: ignore

def extractions(self, readonly=False) -> List[Extraction]: View Source

695        def child_method(self, readonly: bool = False) -> list[child_class]:
696            return child_class.all_children_of_parent_path(self.path, readonly=readonly)

Return the Extraction children found under this document's path; `readonly` is passed through to the child loader.

friendly_name: str View Source

288    @computed_field
289    @property
290    def friendly_name(self) -> str:
291        # backward compatibility: old documents did not have name_override
292        return self.name_override or self.name

def relationship_name() -> str: View Source

713        def relationship_name_method() -> str:
714            return relationship_name

Return the relationship name under which this model type is registered with its parent type.

def parent_type() -> Type[kiln_ai.datamodel.basemodel.KilnParentModel]: View Source

706        def parent_class_method() -> Type[KilnParentModel]:
707            return cls

Return the parent model class (a KilnParentModel subclass) that this model type is registered under.

model_config = {'validate_assignment': True}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

def model_post_init(self: pydantic.main.BaseModel, context: Any, /) -> None: View Source

def init_private_attributes(self: BaseModel, context: Any, /) -> None:
    """Populate ``__pydantic_private__`` with private-attribute defaults.

    Written to behave like a BaseModel method. ``context`` is accepted because
    pydantic-core passes it when calling this function; it is not used here.

    Args:
        self: The BaseModel instance.
        context: The context.
    """
    # Leave the instance untouched when private storage is already present.
    if getattr(self, '__pydantic_private__', None) is not None:
        return

    resolved_defaults = (
        (attr_name, attr.get_default())
        for attr_name, attr in self.__private_attributes__.items()
    )
    # Attributes whose default is PydanticUndefined get no entry at all.
    private_values = {
        attr_name: value
        for attr_name, value in resolved_defaults
        if value is not PydanticUndefined
    }
    object_setattr(self, '__pydantic_private__', private_values)

This function is meant to behave like a BaseModel method to initialise private attributes.

It takes context as an argument since that's what pydantic-core passes when calling it.

Args: self: The BaseModel instance. context: The context.

def get_kind_from_mime_type(mime_type: str) -> Kind | None: View Source

def get_kind_from_mime_type(mime_type: str) -> Kind | None:
    """Look up which Kind lists ``mime_type`` in SUPPORTED_MIME_TYPES.

    Returns:
        The first kind, in the table's iteration order, whose set contains
        the MIME type, or None when no kind lists it.
    """
    matching_kinds = (
        kind
        for kind, accepted in SUPPORTED_MIME_TYPES.items()
        if mime_type in accepted
    )
    return next(matching_kinds, None)