kiln_ai.datamodel.extraction
```python
import logging
from enum import Enum
from typing import TYPE_CHECKING, List, Literal, Union

import anyio
from pydantic import (
    BaseModel,
    Field,
    SerializationInfo,
    ValidationInfo,
    computed_field,
    field_serializer,
    field_validator,
    model_validator,
)
from typing_extensions import Self, TypedDict

from kiln_ai.datamodel.basemodel import (
    ID_TYPE,
    FilenameString,
    KilnAttachmentModel,
    KilnParentedModel,
    KilnParentModel,
)
from kiln_ai.datamodel.chunk import ChunkedDocument
from kiln_ai.utils.validation import NonEmptyString

logger = logging.getLogger(__name__)

if TYPE_CHECKING:
    from kiln_ai.datamodel.project import Project

logger = logging.getLogger(__name__)


class Kind(str, Enum):
    DOCUMENT = "document"
    IMAGE = "image"
    VIDEO = "video"
    AUDIO = "audio"


class OutputFormat(str, Enum):
    TEXT = "text/plain"
    MARKDOWN = "text/markdown"


class ExtractorType(str, Enum):
    LITELLM = "litellm"


SUPPORTED_MIME_TYPES = {
    Kind.DOCUMENT: {
        "application/pdf",
        "text/plain",
        "text/markdown",
        "text/html",
        "text/md",
    },
    Kind.IMAGE: {
        "image/png",
        "image/jpeg",
    },
    Kind.VIDEO: {
        "video/mp4",
        "video/quicktime",
    },
    Kind.AUDIO: {
        "audio/wav",
        "audio/mpeg",
        "audio/ogg",
    },
}


class ExtractionModel(BaseModel):
    name: str
    label: str


class ExtractionSource(str, Enum):
    PROCESSED = "processed"
    PASSTHROUGH = "passthrough"


class Extraction(
    KilnParentedModel, KilnParentModel, parent_of={"chunked_documents": ChunkedDocument}
):
    source: ExtractionSource = Field(
        description="The source of the extraction.",
    )
    extractor_config_id: ID_TYPE = Field(
        description="The ID of the extractor config used to extract the data.",
    )
    output: KilnAttachmentModel = Field(
        description="The extraction output.",
    )

    def parent_document(self) -> Union["Document", None]:
        if self.parent is None or self.parent.__class__.__name__ != "Document":
            return None
        return self.parent  # type: ignore

    async def output_content(self) -> str | None:
        if not self.path:
            raise ValueError(
                "Failed to resolve the path of extraction output attachment because the extraction does not have a path."
            )

        full_path = self.output.resolve_path(self.path.parent)

        try:
            return await anyio.Path(full_path).read_text(encoding="utf-8")
        except Exception as e:
            logger.error(
                f"Failed to read extraction output for {full_path}: {e}", exc_info=True
            )
            raise ValueError(f"Failed to read extraction output: {e}")

    def chunked_documents(self, readonly: bool = False) -> list[ChunkedDocument]:
        return super().chunked_documents(readonly=readonly)  # type: ignore


class LitellmExtractorConfigProperties(TypedDict, total=True):
    extractor_type: Literal[ExtractorType.LITELLM]
    prompt_document: NonEmptyString
    prompt_image: NonEmptyString
    prompt_video: NonEmptyString
    prompt_audio: NonEmptyString


class ExtractorConfig(KilnParentedModel):
    name: FilenameString = Field(
        description="A name to identify the extractor config.",
    )
    is_archived: bool = Field(
        default=False,
        description="Whether the extractor config is archived. Archived extractor configs are not shown in the UI and are not available for use.",
    )
    description: str | None = Field(
        default=None, description="The description of the extractor config"
    )
    model_provider_name: str = Field(
        description="The name of the model provider to use for the extractor config.",
    )
    model_name: str = Field(
        description="The name of the model to use for the extractor config.",
    )
    output_format: OutputFormat = Field(
        default=OutputFormat.MARKDOWN,
        description="The format to use for the output.",
    )
    passthrough_mimetypes: list[OutputFormat] = Field(
        default_factory=list,
        description="If the mimetype is in this list, the extractor will not be used and the text content of the file will be returned as is.",
    )
    extractor_type: ExtractorType = Field(
        description="This is used to determine the type of extractor to use.",
    )
    properties: LitellmExtractorConfigProperties = Field(
        description="Properties to be used to execute the extractor config. This is extractor_type specific and should serialize to a json dict.",
        # the discriminator refers to the properties->extractor_type key (not the extractor_type field on the parent model)
        discriminator="extractor_type",
    )

    @model_validator(mode="before")
    def upgrade_missing_discriminator_properties(
        cls, data: dict, info: ValidationInfo
    ) -> dict:
        if not info.context or not info.context.get("loading_from_file", False):
            # Not loading from file, so no need to upgrade
            return data

        if not isinstance(data, dict):
            return data

        # backward compatibility:
        # - we originally did not have the extractor_type in the properties, so we need to add it here
        # - we later wanted extractor_type in the properties to use pydantic's discriminated union feature
        properties = data.get("properties", {})
        if "extractor_type" not in properties:
            # the extractor_type on the parent model is always there, we just need to add it to the properties
            properties["extractor_type"] = data["extractor_type"]
            data["properties"] = properties
        return data

    @model_validator(mode="after")
    def ensure_extractor_type_matches_properties(self):
        # sanity check to ensure the extractor_type matches the properties extractor_type
        if self.extractor_type != self.properties["extractor_type"]:
            raise ValueError(
                f"Extractor type mismatch: {self.extractor_type} != {self.properties['extractor_type']}. This is a bug, please report it."
            )
        return self

    @property
    def litellm_properties(self) -> LitellmExtractorConfigProperties:
        if self.properties["extractor_type"] != ExtractorType.LITELLM:
            raise ValueError(
                f"Litellm properties are only available for litellm extractor type. Got {self.properties.get('extractor_type')}"
            )
        return self.properties

    # Workaround to return typed parent without importing Project
    def parent_project(self) -> Union["Project", None]:
        if self.parent is None or self.parent.__class__.__name__ != "Project":
            return None
        return self.parent  # type: ignore


class FileInfo(BaseModel):
    filename: str = Field(description="The filename of the file")

    size: int = Field(description="The size of the file in bytes")

    mime_type: str = Field(description="The MIME type of the file")

    attachment: KilnAttachmentModel = Field(
        description="The attachment to the file",
    )

    @field_serializer("attachment")
    def serialize_attachment(
        self, attachment: KilnAttachmentModel, info: SerializationInfo
    ) -> dict:
        context = info.context or {}
        context["filename_prefix"] = "attachment"
        return attachment.model_dump(mode="json", context=context)

    @field_validator("mime_type")
    @classmethod
    def validate_mime_type(cls, mime_type: str, info: ValidationInfo) -> str:
        filename = info.data.get("filename") or ""

        for mime_types in SUPPORTED_MIME_TYPES.values():
            if mime_type in mime_types:
                return mime_type
        raise ValueError(f"MIME type is not supported: {mime_type} (for {filename})")


class Document(
    KilnParentedModel, KilnParentModel, parent_of={"extractions": Extraction}
):
    # this field should not be changed after creation
    name: FilenameString = Field(
        description="A name to identify the document.",
    )

    # this field can be changed after creation
    name_override: str | None = Field(
        description="A friendly name to identify the document. This is used for display purposes and can be different from the name.",
        default=None,
    )

    description: str = Field(description="A description for the file")

    original_file: FileInfo = Field(description="The original file")

    kind: Kind = Field(
        description="The kind of document. The kind is a broad family of filetypes that can be handled in a similar way"
    )

    tags: List[str] = Field(
        default_factory=list,
        description="Tags for the document. Tags are used to categorize documents for filtering and reporting.",
    )

    @model_validator(mode="after")
    def validate_tags(self) -> Self:
        for tag in self.tags:
            if not tag:
                raise ValueError("Tags cannot be empty strings")
            if " " in tag:
                raise ValueError("Tags cannot contain spaces. Try underscores.")

        return self

    # Workaround to return typed parent without importing Project
    def parent_project(self) -> Union["Project", None]:
        if self.parent is None or self.parent.__class__.__name__ != "Project":
            return None
        return self.parent  # type: ignore

    def extractions(self, readonly: bool = False) -> list[Extraction]:
        return super().extractions(readonly=readonly)  # type: ignore

    @computed_field
    @property
    def friendly_name(self) -> str:
        # backward compatibility: old documents did not have name_override
        return self.name_override or self.name


def get_kind_from_mime_type(mime_type: str) -> Kind | None:
    for kind, mime_types in SUPPORTED_MIME_TYPES.items():
        if mime_type in mime_types:
            return kind
    return None
```
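The `SUPPORTED_MIME_TYPES` table drives both upload validation and kind detection. A minimal sketch of mapping an incoming file's MIME type to a `Kind`, using only names defined in this module:

```python
from kiln_ai.datamodel.extraction import Kind, get_kind_from_mime_type

# Supported types resolve to the broad Kind family they belong to.
assert get_kind_from_mime_type("application/pdf") == Kind.DOCUMENT
assert get_kind_from_mime_type("image/png") == Kind.IMAGE

# Unsupported types return None rather than raising, so callers can decide
# whether to reject the file or handle it some other way.
if get_kind_from_mime_type("application/zip") is None:
    print("Unsupported file type; refusing upload")
```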
```python
class Kind(str, Enum):
    DOCUMENT = "document"
    IMAGE = "image"
    VIDEO = "video"
    AUDIO = "audio"
```
```python
class Extraction(
    KilnParentedModel, KilnParentModel, parent_of={"chunked_documents": ChunkedDocument}
):
    source: ExtractionSource = Field(
        description="The source of the extraction.",
    )
    extractor_config_id: ID_TYPE = Field(
        description="The ID of the extractor config used to extract the data.",
    )
    output: KilnAttachmentModel = Field(
        description="The extraction output.",
    )

    def parent_document(self) -> Union["Document", None]:
        if self.parent is None or self.parent.__class__.__name__ != "Document":
            return None
        return self.parent  # type: ignore

    async def output_content(self) -> str | None:
        if not self.path:
            raise ValueError(
                "Failed to resolve the path of extraction output attachment because the extraction does not have a path."
            )

        full_path = self.output.resolve_path(self.path.parent)

        try:
            return await anyio.Path(full_path).read_text(encoding="utf-8")
        except Exception as e:
            logger.error(
                f"Failed to read extraction output for {full_path}: {e}", exc_info=True
            )
            raise ValueError(f"Failed to read extraction output: {e}")

    def chunked_documents(self, readonly: bool = False) -> list[ChunkedDocument]:
        return super().chunked_documents(readonly=readonly)  # type: ignore
```
Base model for Kiln models that have a parent-child relationship. This base class is for child models.
This class provides functionality for managing hierarchical relationships between models, including parent reference handling and file system organization.
Attributes: parent (KilnBaseModel): Reference to the parent model instance. Not persisted, just in memory.
```python
    async def output_content(self) -> str | None:
        if not self.path:
            raise ValueError(
                "Failed to resolve the path of extraction output attachment because the extraction does not have a path."
            )

        full_path = self.output.resolve_path(self.path.parent)

        try:
            return await anyio.Path(full_path).read_text(encoding="utf-8")
        except Exception as e:
            logger.error(
                f"Failed to read extraction output for {full_path}: {e}", exc_info=True
            )
            raise ValueError(f"Failed to read extraction output: {e}")
```
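Since `output_content` is a coroutine, it must be awaited. A minimal sketch, assuming `extraction` is an `Extraction` instance that has already been loaded from disk (so `path` is set):

```python
import asyncio


async def read_extraction_text(extraction) -> str | None:
    # output_content raises ValueError if the extraction has no path on disk
    # or the attachment cannot be read; otherwise it returns the extracted text.
    try:
        return await extraction.output_content()
    except ValueError:
        return None


# text = asyncio.run(read_extraction_text(extraction))
```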
```python
    def child_method(self, readonly: bool = False) -> list[child_class]:
        return child_class.all_children_of_parent_path(self.path, readonly=readonly)
```
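The `chunked_documents` accessor follows the generic child-accessor pattern that `KilnParentModel` generates from the `parent_of` mapping shown above: children are discovered from the parent's path on disk. A hedged sketch of navigating from an extraction down to its chunked documents and up to the owning document, using only names defined in this module:

```python
def describe_extraction(extraction) -> None:
    # Walk up to the owning Document (None if the extraction was loaded without a parent).
    doc = extraction.parent_document()
    doc_name = doc.friendly_name if doc is not None else "<unknown document>"

    # Walk down to child ChunkedDocument records; readonly is forwarded to the child loader.
    chunks = extraction.chunked_documents(readonly=True)
    print(
        f"{doc_name}: {len(chunks)} chunked document(s), "
        f"extractor config {extraction.extractor_config_id}"
    )
```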
```python
class LitellmExtractorConfigProperties(TypedDict, total=True):
    extractor_type: Literal[ExtractorType.LITELLM]
    prompt_document: NonEmptyString
    prompt_image: NonEmptyString
    prompt_video: NonEmptyString
    prompt_audio: NonEmptyString
```
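Because `LitellmExtractorConfigProperties` is a `total=True` TypedDict, every prompt key must be present and `extractor_type` must be the `ExtractorType.LITELLM` literal. A minimal sketch of a valid properties dict (the prompt wording is illustrative, not taken from the library):

```python
from kiln_ai.datamodel.extraction import ExtractorType, LitellmExtractorConfigProperties

properties: LitellmExtractorConfigProperties = {
    "extractor_type": ExtractorType.LITELLM,
    "prompt_document": "Transcribe the document as markdown.",
    "prompt_image": "Describe the image in detail.",
    "prompt_video": "Summarize what happens in the video.",
    "prompt_audio": "Transcribe the audio.",
}
```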
```python
class ExtractorConfig(KilnParentedModel):
    name: FilenameString = Field(
        description="A name to identify the extractor config.",
    )
    is_archived: bool = Field(
        default=False,
        description="Whether the extractor config is archived. Archived extractor configs are not shown in the UI and are not available for use.",
    )
    description: str | None = Field(
        default=None, description="The description of the extractor config"
    )
    model_provider_name: str = Field(
        description="The name of the model provider to use for the extractor config.",
    )
    model_name: str = Field(
        description="The name of the model to use for the extractor config.",
    )
    output_format: OutputFormat = Field(
        default=OutputFormat.MARKDOWN,
        description="The format to use for the output.",
    )
    passthrough_mimetypes: list[OutputFormat] = Field(
        default_factory=list,
        description="If the mimetype is in this list, the extractor will not be used and the text content of the file will be returned as is.",
    )
    extractor_type: ExtractorType = Field(
        description="This is used to determine the type of extractor to use.",
    )
    properties: LitellmExtractorConfigProperties = Field(
        description="Properties to be used to execute the extractor config. This is extractor_type specific and should serialize to a json dict.",
        # the discriminator refers to the properties->extractor_type key (not the extractor_type field on the parent model)
        discriminator="extractor_type",
    )

    @model_validator(mode="before")
    def upgrade_missing_discriminator_properties(
        cls, data: dict, info: ValidationInfo
    ) -> dict:
        if not info.context or not info.context.get("loading_from_file", False):
            # Not loading from file, so no need to upgrade
            return data

        if not isinstance(data, dict):
            return data

        # backward compatibility:
        # - we originally did not have the extractor_type in the properties, so we need to add it here
        # - we later wanted extractor_type in the properties to use pydantic's discriminated union feature
        properties = data.get("properties", {})
        if "extractor_type" not in properties:
            # the extractor_type on the parent model is always there, we just need to add it to the properties
            properties["extractor_type"] = data["extractor_type"]
            data["properties"] = properties
        return data

    @model_validator(mode="after")
    def ensure_extractor_type_matches_properties(self):
        # sanity check to ensure the extractor_type matches the properties extractor_type
        if self.extractor_type != self.properties["extractor_type"]:
            raise ValueError(
                f"Extractor type mismatch: {self.extractor_type} != {self.properties['extractor_type']}. This is a bug, please report it."
            )
        return self

    @property
    def litellm_properties(self) -> LitellmExtractorConfigProperties:
        if self.properties["extractor_type"] != ExtractorType.LITELLM:
            raise ValueError(
                f"Litellm properties are only available for litellm extractor type. Got {self.properties.get('extractor_type')}"
            )
        return self.properties

    # Workaround to return typed parent without importing Project
    def parent_project(self) -> Union["Project", None]:
        if self.parent is None or self.parent.__class__.__name__ != "Project":
            return None
        return self.parent  # type: ignore
```
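A hedged construction sketch for an `ExtractorConfig`. The provider and model names below are placeholders rather than values the library is known to support, and the `properties` dict reuses the shape shown above:

```python
from kiln_ai.datamodel.extraction import ExtractorConfig, ExtractorType, OutputFormat

config = ExtractorConfig(
    name="markdown_extractor",
    description="Extracts documents to markdown via a vision-capable model.",
    model_provider_name="example_provider",  # placeholder provider name
    model_name="example-vision-model",       # placeholder model name
    output_format=OutputFormat.MARKDOWN,
    extractor_type=ExtractorType.LITELLM,
    properties={
        "extractor_type": ExtractorType.LITELLM,
        "prompt_document": "Transcribe the document as markdown.",
        "prompt_image": "Describe the image in detail.",
        "prompt_video": "Summarize what happens in the video.",
        "prompt_audio": "Transcribe the audio.",
    },
)
```

Note that when constructing directly (not loading from a file), `extractor_type` must already be present inside `properties`; the backward-compatibility upgrade only runs for file loads.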
Base model for Kiln models that have a parent-child relationship. This base class is for child models.
This class provides functionality for managing hierarchical relationships between models, including parent reference handling and file system organization.
Attributes: parent (KilnBaseModel): Reference to the parent model instance. Not persisted, just in memory.
```python
    @model_validator(mode="before")
    def upgrade_missing_discriminator_properties(
        cls, data: dict, info: ValidationInfo
    ) -> dict:
        if not info.context or not info.context.get("loading_from_file", False):
            # Not loading from file, so no need to upgrade
            return data

        if not isinstance(data, dict):
            return data

        # backward compatibility:
        # - we originally did not have the extractor_type in the properties, so we need to add it here
        # - we later wanted extractor_type in the properties to use pydantic's discriminated union feature
        properties = data.get("properties", {})
        if "extractor_type" not in properties:
            # the extractor_type on the parent model is always there, we just need to add it to the properties
            properties["extractor_type"] = data["extractor_type"]
            data["properties"] = properties
        return data
```
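The upgrade only runs when the validation context flags that data is being loaded from an existing file. A sketch of how an older on-disk payload (written before `extractor_type` lived inside `properties`) would be upgraded; the field values are placeholders, and real files include additional Kiln base-model bookkeeping fields that are omitted here:

```python
from kiln_ai.datamodel.extraction import ExtractorConfig, ExtractorType

legacy_data = {
    "name": "legacy_extractor",
    "model_provider_name": "example_provider",  # placeholder
    "model_name": "example-model",              # placeholder
    "extractor_type": "litellm",
    "properties": {
        # no "extractor_type" key: written before the discriminator was added
        "prompt_document": "Transcribe the document.",
        "prompt_image": "Describe the image.",
        "prompt_video": "Summarize the video.",
        "prompt_audio": "Transcribe the audio.",
    },
}

# The before-validator copies extractor_type into properties only when this
# context flag is set, mirroring how Kiln loads models from disk.
config = ExtractorConfig.model_validate(
    legacy_data, context={"loading_from_file": True}
)
assert config.extractor_type == ExtractorType.LITELLM
```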
```python
    @model_validator(mode="after")
    def ensure_extractor_type_matches_properties(self):
        # sanity check to ensure the extractor_type matches the properties extractor_type
        if self.extractor_type != self.properties["extractor_type"]:
            raise ValueError(
                f"Extractor type mismatch: {self.extractor_type} != {self.properties['extractor_type']}. This is a bug, please report it."
            )
        return self
```
```python
    @property
    def litellm_properties(self) -> LitellmExtractorConfigProperties:
        if self.properties["extractor_type"] != ExtractorType.LITELLM:
            raise ValueError(
                f"Litellm properties are only available for litellm extractor type. Got {self.properties.get('extractor_type')}"
            )
        return self.properties
```
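A short usage note, continuing the construction sketch above: `litellm_properties` is the typed accessor an extractor can read its prompts from, and it raises for non-litellm configs.

```python
props = config.litellm_properties  # raises ValueError if the config is not a litellm config
document_prompt = props["prompt_document"]
```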
```python
class FileInfo(BaseModel):
    filename: str = Field(description="The filename of the file")

    size: int = Field(description="The size of the file in bytes")

    mime_type: str = Field(description="The MIME type of the file")

    attachment: KilnAttachmentModel = Field(
        description="The attachment to the file",
    )

    @field_serializer("attachment")
    def serialize_attachment(
        self, attachment: KilnAttachmentModel, info: SerializationInfo
    ) -> dict:
        context = info.context or {}
        context["filename_prefix"] = "attachment"
        return attachment.model_dump(mode="json", context=context)

    @field_validator("mime_type")
    @classmethod
    def validate_mime_type(cls, mime_type: str, info: ValidationInfo) -> str:
        filename = info.data.get("filename") or ""

        for mime_types in SUPPORTED_MIME_TYPES.values():
            if mime_type in mime_types:
                return mime_type
        raise ValueError(f"MIME type is not supported: {mime_type} (for {filename})")
```
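A hedged sketch of building a `FileInfo` for an uploaded PDF. Constructing a `KilnAttachmentModel` is outside the scope of this module, so the snippet assumes `attachment` was created elsewhere:

```python
from kiln_ai.datamodel.extraction import FileInfo


def make_file_info(attachment, raw_bytes: bytes) -> FileInfo:
    # mime_type must be one of the entries in SUPPORTED_MIME_TYPES;
    # otherwise validate_mime_type raises a validation error.
    return FileInfo(
        filename="report.pdf",
        size=len(raw_bytes),
        mime_type="application/pdf",
        attachment=attachment,
    )
```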
```python
    @field_validator("mime_type")
    @classmethod
    def validate_mime_type(cls, mime_type: str, info: ValidationInfo) -> str:
        filename = info.data.get("filename") or ""

        for mime_types in SUPPORTED_MIME_TYPES.values():
            if mime_type in mime_types:
                return mime_type
        raise ValueError(f"MIME type is not supported: {mime_type} (for {filename})")
```
```python
class Document(
    KilnParentedModel, KilnParentModel, parent_of={"extractions": Extraction}
):
    # this field should not be changed after creation
    name: FilenameString = Field(
        description="A name to identify the document.",
    )

    # this field can be changed after creation
    name_override: str | None = Field(
        description="A friendly name to identify the document. This is used for display purposes and can be different from the name.",
        default=None,
    )

    description: str = Field(description="A description for the file")

    original_file: FileInfo = Field(description="The original file")

    kind: Kind = Field(
        description="The kind of document. The kind is a broad family of filetypes that can be handled in a similar way"
    )

    tags: List[str] = Field(
        default_factory=list,
        description="Tags for the document. Tags are used to categorize documents for filtering and reporting.",
    )

    @model_validator(mode="after")
    def validate_tags(self) -> Self:
        for tag in self.tags:
            if not tag:
                raise ValueError("Tags cannot be empty strings")
            if " " in tag:
                raise ValueError("Tags cannot contain spaces. Try underscores.")

        return self

    # Workaround to return typed parent without importing Project
    def parent_project(self) -> Union["Project", None]:
        if self.parent is None or self.parent.__class__.__name__ != "Project":
            return None
        return self.parent  # type: ignore

    def extractions(self, readonly: bool = False) -> list[Extraction]:
        return super().extractions(readonly=readonly)  # type: ignore

    @computed_field
    @property
    def friendly_name(self) -> str:
        # backward compatibility: old documents did not have name_override
        return self.name_override or self.name
```
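The tag rules (non-empty, no spaces) are enforced at validation time by `validate_tags`. A sketch, assuming `file_info` is a `FileInfo` built as in the earlier snippet:

```python
from kiln_ai.datamodel.extraction import Document, Kind

doc = Document(
    name="q3_report",
    description="Quarterly report uploaded for extraction.",
    original_file=file_info,
    kind=Kind.DOCUMENT,
    tags=["finance", "q3_2024"],  # valid: underscores instead of spaces
)

# A tag containing a space fails validation:
# Document(..., tags=["q3 2024"])  -> ValueError: Tags cannot contain spaces. Try underscores.

print(doc.friendly_name)  # falls back to name when name_override is not set
```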
Base model for Kiln models that have a parent-child relationship. This base class is for child models.
This class provides functionality for managing hierarchical relationships between models, including parent reference handling and file system organization.
Attributes: parent (KilnBaseModel): Reference to the parent model instance. Not persisted, just in memory.
```python
    def child_method(self, readonly: bool = False) -> list[child_class]:
        return child_class.all_children_of_parent_path(self.path, readonly=readonly)
```