kiln_ai.datamodel.extraction
import logging
from enum import Enum
from typing import TYPE_CHECKING, List, Literal, Union

import anyio
from pydantic import (
    BaseModel,
    Field,
    SerializationInfo,
    ValidationInfo,
    computed_field,
    field_serializer,
    field_validator,
    model_validator,
)
from typing_extensions import Self, TypedDict

from kiln_ai.datamodel.basemodel import (
    ID_TYPE,
    FilenameString,
    KilnAttachmentModel,
    KilnParentedModel,
    KilnParentModel,
)
from kiln_ai.datamodel.chunk import ChunkedDocument
from kiln_ai.utils.validation import NonEmptyString

if TYPE_CHECKING:
    from kiln_ai.datamodel.project import Project

# Single module-level logger (the original assigned it twice).
logger = logging.getLogger(__name__)


class Kind(str, Enum):
    """The kind of content a document contains."""

    DOCUMENT = "document"
    IMAGE = "image"
    VIDEO = "video"
    AUDIO = "audio"


class OutputFormat(str, Enum):
    """The output format for extraction results."""

    TEXT = "text/plain"
    MARKDOWN = "text/markdown"


class ExtractorType(str, Enum):
    """The type of extractor used to process documents."""

    LITELLM = "litellm"


# MIME types accepted by FileInfo.validate_mime_type, grouped by document Kind.
SUPPORTED_MIME_TYPES = {
    Kind.DOCUMENT: {
        "application/pdf",
        "text/plain",
        "text/markdown",
        "text/html",
        "text/md",
    },
    Kind.IMAGE: {
        "image/png",
        "image/jpeg",
    },
    Kind.VIDEO: {
        "video/mp4",
        "video/quicktime",
    },
    Kind.AUDIO: {
        "audio/wav",
        "audio/mpeg",
        "audio/ogg",
    },
}


class ExtractionModel(BaseModel):
    """A model available for document extraction."""

    name: str = Field(description="The model identifier.")
    label: str = Field(description="A human-readable name for the model.")


class ExtractionSource(str, Enum):
    """Whether the document was processed by an extractor or passed through as-is."""

    PROCESSED = "processed"
    PASSTHROUGH = "passthrough"


class Extraction(
    KilnParentedModel, KilnParentModel, parent_of={"chunked_documents": ChunkedDocument}
):
    """The result of extracting content from a document."""

    source: ExtractionSource = Field(
        description="The source of the extraction.",
    )
    extractor_config_id: ID_TYPE = Field(
        description="The ID of the extractor config used to extract the data.",
    )
    output: KilnAttachmentModel = Field(
        description="The extraction output.",
    )

    def parent_document(self) -> Union["Document", None]:
        """Return the parent if its class is named "Document", otherwise None."""
        # Compared by class name so the check does not need the class object.
        if self.parent is None or self.parent.__class__.__name__ != "Document":
            return None
        return self.parent  # type: ignore

    async def output_content(self) -> str | None:
        """Read the extraction output attachment as UTF-8 text.

        Raises:
            ValueError: If the extraction has no path, or if reading the
                attachment fails (the underlying error is chained as the cause).
        """
        if not self.path:
            raise ValueError(
                "Failed to resolve the path of extraction output attachment because the extraction does not have a path."
            )

        # The attachment path is resolved against the directory holding this model.
        full_path = self.output.resolve_path(self.path.parent)

        try:
            return await anyio.Path(full_path).read_text(encoding="utf-8")
        except Exception as e:
            logger.error(
                f"Failed to read extraction output for {full_path}: {e}", exc_info=True
            )
            # Chain explicitly so the original failure is kept as __cause__.
            raise ValueError(f"Failed to read extraction output: {e}") from e

    def chunked_documents(self, readonly: bool = False) -> list[ChunkedDocument]:
        """Typed wrapper around the parent-class `chunked_documents` accessor."""
        return super().chunked_documents(readonly=readonly)  # type: ignore


class LitellmExtractorConfigProperties(TypedDict, total=True):
    # total=True: every key below is required.
    extractor_type: Literal[ExtractorType.LITELLM]
    prompt_document: NonEmptyString
    prompt_image: NonEmptyString
    prompt_video: NonEmptyString
    prompt_audio: NonEmptyString


class ExtractorConfig(KilnParentedModel):
    """Configuration for extracting content from documents using a specific model and prompts."""

    name: FilenameString = Field(
        description="A name to identify the extractor config.",
    )
    is_archived: bool = Field(
        default=False,
        description="Whether the extractor config is archived. Archived extractor configs are not shown in the UI and are not available for use.",
    )
    description: str | None = Field(
        default=None, description="The description of the extractor config"
    )
    model_provider_name: str = Field(
        description="The name of the model provider to use for the extractor config.",
    )
    model_name: str = Field(
        description="The name of the model to use for the extractor config.",
    )
    output_format: OutputFormat = Field(
        default=OutputFormat.MARKDOWN,
        description="The format to use for the output.",
    )
    passthrough_mimetypes: list[OutputFormat] = Field(
        default_factory=list,
        description="If the mimetype is in this list, the extractor will not be used and the text content of the file will be returned as is.",
    )
    extractor_type: ExtractorType = Field(
        description="This is used to determine the type of extractor to use.",
    )
    properties: LitellmExtractorConfigProperties = Field(
        description="Properties to be used to execute the extractor config. This is extractor_type specific and should serialize to a json dict.",
        # the discriminator refers to the properties->extractor_type key (not the extractor_type field on the parent model)
        discriminator="extractor_type",
    )

    @model_validator(mode="before")
    @classmethod
    def upgrade_missing_discriminator_properties(
        cls, data: dict, info: ValidationInfo
    ) -> dict:
        """Backfill `properties["extractor_type"]` when loading older files.

        Only runs when the validation context has a truthy "loading_from_file"
        entry and the input is a dict. The input is not mutated; an upgraded
        copy is returned when a backfill is needed.
        """
        if not info.context or not info.context.get("loading_from_file", False):
            # Not loading from file, so no need to upgrade
            return data

        if not isinstance(data, dict):
            return data

        # backward compatibility:
        # - we originally did not have the extractor_type in the properties, so we need to add it here
        # - we later wanted extractor_type in the properties to use pydantic's discriminated union feature
        properties = data.get("properties", {})
        if not isinstance(properties, dict) or "extractor_type" in properties:
            # Nothing to upgrade; malformed values are left for field validation to report.
            return data

        if "extractor_type" not in data:
            # Without a top-level extractor_type there is nothing to copy; field
            # validation reports the missing field instead of a raw KeyError here.
            return data

        upgraded_properties = {**properties, "extractor_type": data["extractor_type"]}
        return {**data, "properties": upgraded_properties}

    @model_validator(mode="after")
    def ensure_extractor_type_matches_properties(self) -> Self:
        """Reject configs whose top-level and nested extractor types differ."""
        # sanity check to ensure the extractor_type matches the properties extractor_type
        if self.extractor_type != self.properties["extractor_type"]:
            raise ValueError(
                f"Extractor type mismatch: {self.extractor_type} != {self.properties['extractor_type']}. This is a bug, please report it."
            )
        return self

    @property
    def litellm_properties(self) -> LitellmExtractorConfigProperties:
        """Return `properties`, raising ValueError unless the type is LITELLM."""
        if self.properties["extractor_type"] != ExtractorType.LITELLM:
            raise ValueError(
                f"Litellm properties are only available for litellm extractor type. Got {self.properties.get('extractor_type')}"
            )
        return self.properties

    # Workaround to return typed parent without importing Project
    def parent_project(self) -> Union["Project", None]:
        """Return the parent if its class is named "Project", otherwise None."""
        if self.parent is None or self.parent.__class__.__name__ != "Project":
            return None
        return self.parent  # type: ignore


class FileInfo(BaseModel):
    """Metadata about an uploaded file."""

    filename: str = Field(description="The filename of the file")

    size: int = Field(description="The size of the file in bytes")

    mime_type: str = Field(description="The MIME type of the file")

    attachment: KilnAttachmentModel = Field(
        description="The attachment to the file",
    )

    @field_serializer("attachment")
    def serialize_attachment(
        self, attachment: KilnAttachmentModel, info: SerializationInfo
    ) -> dict:
        """Dump the attachment in JSON mode with `filename_prefix` set to "attachment"."""
        # Work on a copy so the caller's context dict is not modified.
        context = dict(info.context or {})
        context["filename_prefix"] = "attachment"
        return attachment.model_dump(mode="json", context=context)

    @field_validator("mime_type")
    @classmethod
    def validate_mime_type(cls, mime_type: str, info: ValidationInfo) -> str:
        """Accept only MIME types listed in SUPPORTED_MIME_TYPES.

        Raises:
            ValueError: If the MIME type is not in any supported set.
        """
        if any(mime_type in supported for supported in SUPPORTED_MIME_TYPES.values()):
            return mime_type

        # The filename is only used to make the error message more helpful.
        filename = info.data.get("filename") or ""
        raise ValueError(f"MIME type is not supported: {mime_type} (for {filename})")


class Document(
    KilnParentedModel, KilnParentModel, parent_of={"extractions": Extraction}
):
    """A document uploaded to a project for extraction and RAG."""

    name: FilenameString = Field(
        description="A name to identify the document. Should not be changed after creation.",
    )

    # this field can be changed after creation
    name_override: str | None = Field(
        description="A friendly name to identify the document. This is used for display purposes and can be different from the name.",
        default=None,
    )

    description: str = Field(description="A description for the file")

    original_file: FileInfo = Field(description="The original file")

    kind: Kind = Field(
        description="The kind of document. The kind is a broad family of filetypes that can be handled in a similar way"
    )

    tags: List[str] = Field(
        default_factory=list,
        description="Tags for the document. Tags are used to categorize documents for filtering and reporting.",
    )

    @model_validator(mode="after")
    def validate_tags(self) -> Self:
        """Reject empty tags and tags that contain a space character."""
        for tag in self.tags:
            if not tag:
                raise ValueError("Tags cannot be empty strings")
            if " " in tag:
                raise ValueError("Tags cannot contain spaces. Try underscores.")

        return self

    # Workaround to return typed parent without importing Project
    def parent_project(self) -> Union["Project", None]:
        """Return the parent if its class is named "Project", otherwise None."""
        if self.parent is None or self.parent.__class__.__name__ != "Project":
            return None
        return self.parent  # type: ignore

    def extractions(self, readonly: bool = False) -> list[Extraction]:
        """Typed wrapper around the parent-class `extractions` accessor."""
        return super().extractions(readonly=readonly)  # type: ignore

    @computed_field
    @property
    def friendly_name(self) -> str:
        """Display name: `name_override` when set and non-empty, else `name`."""
        # backward compatibility: old documents did not have name_override
        return self.name_override or self.name


def get_kind_from_mime_type(mime_type: str) -> Kind | None:
    """Return the Kind whose supported set contains `mime_type`, or None."""
    for kind, mime_types in SUPPORTED_MIME_TYPES.items():
        if mime_type in mime_types:
            return kind
    return None
class Kind(str, Enum):
    """The kind of content a document contains."""

    # Members also subclass str, so each compares equal to its raw string value.
    DOCUMENT = "document"
    IMAGE = "image"
    VIDEO = "video"
    AUDIO = "audio"
The kind of content a document contains.
class OutputFormat(str, Enum):
    """The output format for extraction results."""

    # Values are written as MIME type strings; members also subclass str.
    TEXT = "text/plain"
    MARKDOWN = "text/markdown"
The output format for extraction results.
class ExtractorType(str, Enum):
    """The type of extractor used to process documents."""

    # Only one extractor type is declared in this enum.
    LITELLM = "litellm"
The type of extractor used to process documents.
class ExtractionModel(BaseModel):
    """A model available for document extraction."""

    # Both fields are declared without defaults, so both are required.
    name: str = Field(description="The model identifier.")
    label: str = Field(description="A human-readable name for the model.")
A model available for document extraction.
class ExtractionSource(str, Enum):
    """Whether the document was processed by an extractor or passed through as-is."""

    # Members also subclass str, so each compares equal to its raw string value.
    PROCESSED = "processed"
    PASSTHROUGH = "passthrough"
Whether the document was processed by an extractor or passed through as-is.
class Extraction(
    KilnParentedModel, KilnParentModel, parent_of={"chunked_documents": ChunkedDocument}
):
    """The result of extracting content from a document."""

    source: ExtractionSource = Field(
        description="The source of the extraction.",
    )
    extractor_config_id: ID_TYPE = Field(
        description="The ID of the extractor config used to extract the data.",
    )
    output: KilnAttachmentModel = Field(
        description="The extraction output.",
    )

    def parent_document(self) -> Union["Document", None]:
        """Return the parent if its class is named "Document", otherwise None."""
        # Compared by class name so the check does not need the class object.
        if self.parent is None or self.parent.__class__.__name__ != "Document":
            return None
        return self.parent  # type: ignore

    async def output_content(self) -> str | None:
        """Read the extraction output attachment as UTF-8 text.

        Raises:
            ValueError: If the extraction has no path, or if reading the
                attachment fails (the underlying error is chained as the cause).
        """
        if not self.path:
            raise ValueError(
                "Failed to resolve the path of extraction output attachment because the extraction does not have a path."
            )

        # The attachment path is resolved against the directory holding this model.
        full_path = self.output.resolve_path(self.path.parent)

        try:
            return await anyio.Path(full_path).read_text(encoding="utf-8")
        except Exception as e:
            logger.error(
                f"Failed to read extraction output for {full_path}: {e}", exc_info=True
            )
            # Chain explicitly so the original failure is kept as __cause__.
            raise ValueError(f"Failed to read extraction output: {e}") from e

    def chunked_documents(self, readonly: bool = False) -> list[ChunkedDocument]:
        """Typed wrapper around the parent-class `chunked_documents` accessor."""
        return super().chunked_documents(readonly=readonly)  # type: ignore
The result of extracting content from a document.
async def output_content(self) -> str | None:
    """Read the extraction output attachment as UTF-8 text.

    Raises:
        ValueError: If the extraction has no path, or if reading the
            attachment fails (the underlying error is chained as the cause).
    """
    if not self.path:
        raise ValueError(
            "Failed to resolve the path of extraction output attachment because the extraction does not have a path."
        )

    # The attachment path is resolved against the directory holding this model.
    full_path = self.output.resolve_path(self.path.parent)

    try:
        return await anyio.Path(full_path).read_text(encoding="utf-8")
    except Exception as e:
        logger.error(
            f"Failed to read extraction output for {full_path}: {e}", exc_info=True
        )
        # Chain explicitly so the original failure is kept as __cause__.
        raise ValueError(f"Failed to read extraction output: {e}") from e
def child_method(self, readonly: bool = False) -> list[child_class]:  # type: ignore[invalid-type-form]
    # Delegate to the child class, passing this instance's path and the readonly flag.
    children = child_class.all_children_of_parent_path(self.path, readonly=readonly)
    return children
The type of the None singleton.
The type of the None singleton.
Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
def init_private_attributes(self: BaseModel, context: Any, /) -> None:
    """Fill `__pydantic_private__` with private-attribute defaults if it is unset.

    Written to be usable like a BaseModel method. `context` is accepted
    because pydantic-core passes it when calling this function; it is not
    read here.

    Args:
        self: The BaseModel instance.
        context: The context.
    """
    # Leave an already-initialised private store untouched.
    if getattr(self, '__pydantic_private__', None) is not None:
        return

    defaults = {}
    for attr_name, attr in self.__private_attributes__.items():
        value = attr.get_default()
        # Attributes without a default are skipped rather than stored.
        if value is PydanticUndefined:
            continue
        defaults[attr_name] = value
    object_setattr(self, '__pydantic_private__', defaults)
This function is meant to behave like a BaseModel method to initialise private attributes.
It takes context as an argument since that's what pydantic-core passes when calling it.
Args: self: The BaseModel instance. context: The context.
class LitellmExtractorConfigProperties(TypedDict, total=True):
    # total=True: every key below is required.
    # extractor_type is pinned to the LITELLM member by the Literal annotation.
    extractor_type: Literal[ExtractorType.LITELLM]
    # NOTE(review): presumably one prompt per document Kind — confirm against the extractor.
    prompt_document: NonEmptyString
    prompt_image: NonEmptyString
    prompt_video: NonEmptyString
    prompt_audio: NonEmptyString
class ExtractorConfig(KilnParentedModel):
    """Configuration for extracting content from documents using a specific model and prompts."""

    name: FilenameString = Field(
        description="A name to identify the extractor config.",
    )
    is_archived: bool = Field(
        default=False,
        description="Whether the extractor config is archived. Archived extractor configs are not shown in the UI and are not available for use.",
    )
    description: str | None = Field(
        default=None, description="The description of the extractor config"
    )
    model_provider_name: str = Field(
        description="The name of the model provider to use for the extractor config.",
    )
    model_name: str = Field(
        description="The name of the model to use for the extractor config.",
    )
    output_format: OutputFormat = Field(
        default=OutputFormat.MARKDOWN,
        description="The format to use for the output.",
    )
    passthrough_mimetypes: list[OutputFormat] = Field(
        default_factory=list,
        description="If the mimetype is in this list, the extractor will not be used and the text content of the file will be returned as is.",
    )
    extractor_type: ExtractorType = Field(
        description="This is used to determine the type of extractor to use.",
    )
    properties: LitellmExtractorConfigProperties = Field(
        description="Properties to be used to execute the extractor config. This is extractor_type specific and should serialize to a json dict.",
        # the discriminator refers to the properties->extractor_type key (not the extractor_type field on the parent model)
        discriminator="extractor_type",
    )

    @model_validator(mode="before")
    @classmethod
    def upgrade_missing_discriminator_properties(
        cls, data: dict, info: ValidationInfo
    ) -> dict:
        """Backfill `properties["extractor_type"]` when loading older files.

        Only runs when the validation context has a truthy "loading_from_file"
        entry and the input is a dict. The input is not mutated; an upgraded
        copy is returned when a backfill is needed.
        """
        if not info.context or not info.context.get("loading_from_file", False):
            # Not loading from file, so no need to upgrade
            return data

        if not isinstance(data, dict):
            return data

        # backward compatibility:
        # - we originally did not have the extractor_type in the properties, so we need to add it here
        # - we later wanted extractor_type in the properties to use pydantic's discriminated union feature
        properties = data.get("properties", {})
        if not isinstance(properties, dict) or "extractor_type" in properties:
            # Nothing to upgrade; malformed values are left for field validation to report.
            return data

        if "extractor_type" not in data:
            # Without a top-level extractor_type there is nothing to copy; field
            # validation reports the missing field instead of a raw KeyError here.
            return data

        upgraded_properties = {**properties, "extractor_type": data["extractor_type"]}
        return {**data, "properties": upgraded_properties}

    @model_validator(mode="after")
    def ensure_extractor_type_matches_properties(self):
        """Reject configs whose top-level and nested extractor types differ."""
        # sanity check to ensure the extractor_type matches the properties extractor_type
        if self.extractor_type != self.properties["extractor_type"]:
            raise ValueError(
                f"Extractor type mismatch: {self.extractor_type} != {self.properties['extractor_type']}. This is a bug, please report it."
            )
        return self

    @property
    def litellm_properties(self) -> LitellmExtractorConfigProperties:
        """Return `properties`, raising ValueError unless the type is LITELLM."""
        if self.properties["extractor_type"] != ExtractorType.LITELLM:
            raise ValueError(
                f"Litellm properties are only available for litellm extractor type. Got {self.properties.get('extractor_type')}"
            )
        return self.properties

    # Workaround to return typed parent without importing Project
    def parent_project(self) -> Union["Project", None]:
        """Return the parent if its class is named "Project", otherwise None."""
        if self.parent is None or self.parent.__class__.__name__ != "Project":
            return None
        return self.parent  # type: ignore
Configuration for extracting content from documents using a specific model and prompts.
@model_validator(mode="before")
@classmethod
def upgrade_missing_discriminator_properties(
    cls, data: dict, info: ValidationInfo
) -> dict:
    """Backfill `properties["extractor_type"]` when loading older files.

    Only runs when the validation context has a truthy "loading_from_file"
    entry and the input is a dict. The input is not mutated; an upgraded
    copy is returned when a backfill is needed.
    """
    if not info.context or not info.context.get("loading_from_file", False):
        # Not loading from file, so no need to upgrade
        return data

    if not isinstance(data, dict):
        return data

    # backward compatibility:
    # - we originally did not have the extractor_type in the properties, so we need to add it here
    # - we later wanted extractor_type in the properties to use pydantic's discriminated union feature
    properties = data.get("properties", {})
    if not isinstance(properties, dict) or "extractor_type" in properties:
        # Nothing to upgrade; malformed values are left for field validation to report.
        return data

    if "extractor_type" not in data:
        # Without a top-level extractor_type there is nothing to copy; field
        # validation reports the missing field instead of a raw KeyError here.
        return data

    upgraded_properties = {**properties, "extractor_type": data["extractor_type"]}
    return {**data, "properties": upgraded_properties}
@model_validator(mode="after")
def ensure_extractor_type_matches_properties(self):
    """Reject configs whose top-level and nested extractor types differ."""
    # Sanity check: the model-level extractor_type must equal properties["extractor_type"].
    declared = self.extractor_type
    nested = self.properties["extractor_type"]
    if declared == nested:
        return self
    raise ValueError(
        f"Extractor type mismatch: {declared} != {nested}. This is a bug, please report it."
    )
@property
def litellm_properties(self) -> LitellmExtractorConfigProperties:
    """Return `properties`, raising ValueError unless the type is LITELLM."""
    props = self.properties
    if props["extractor_type"] == ExtractorType.LITELLM:
        return props
    raise ValueError(
        f"Litellm properties are only available for litellm extractor type. Got {props.get('extractor_type')}"
    )
The type of the None singleton.
Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
def init_private_attributes(self: BaseModel, context: Any, /) -> None:
    """Fill `__pydantic_private__` with private-attribute defaults if it is unset.

    Written to be usable like a BaseModel method. `context` is accepted
    because pydantic-core passes it when calling this function; it is not
    read here.

    Args:
        self: The BaseModel instance.
        context: The context.
    """
    # Leave an already-initialised private store untouched.
    if getattr(self, '__pydantic_private__', None) is not None:
        return

    defaults = {}
    for attr_name, attr in self.__private_attributes__.items():
        value = attr.get_default()
        # Attributes without a default are skipped rather than stored.
        if value is PydanticUndefined:
            continue
        defaults[attr_name] = value
    object_setattr(self, '__pydantic_private__', defaults)
This function is meant to behave like a BaseModel method to initialise private attributes.
It takes context as an argument since that's what pydantic-core passes when calling it.
Args: self: The BaseModel instance. context: The context.
class FileInfo(BaseModel):
    """Metadata about an uploaded file."""

    filename: str = Field(description="The filename of the file")

    size: int = Field(description="The size of the file in bytes")

    mime_type: str = Field(description="The MIME type of the file")

    attachment: KilnAttachmentModel = Field(
        description="The attachment to the file",
    )

    @field_serializer("attachment")
    def serialize_attachment(
        self, attachment: KilnAttachmentModel, info: SerializationInfo
    ) -> dict:
        """Dump the attachment in JSON mode with `filename_prefix` set to "attachment"."""
        # Work on a copy so the caller's context dict is not modified.
        context = dict(info.context or {})
        context["filename_prefix"] = "attachment"
        return attachment.model_dump(mode="json", context=context)

    @field_validator("mime_type")
    @classmethod
    def validate_mime_type(cls, mime_type: str, info: ValidationInfo) -> str:
        """Accept only MIME types listed in SUPPORTED_MIME_TYPES.

        Raises:
            ValueError: If the MIME type is not in any supported set.
        """
        if any(mime_type in supported for supported in SUPPORTED_MIME_TYPES.values()):
            return mime_type

        # The filename is only used to make the error message more helpful.
        filename = info.data.get("filename") or ""
        raise ValueError(f"MIME type is not supported: {mime_type} (for {filename})")
Metadata about an uploaded file.
@field_validator("mime_type")
@classmethod
def validate_mime_type(cls, mime_type: str, info: ValidationInfo) -> str:
    """Accept only MIME types listed in SUPPORTED_MIME_TYPES.

    Raises:
        ValueError: If the MIME type is not in any supported set.
    """
    # Read up front; only used in the error message.
    source_name = info.data.get("filename") or ""

    is_supported = any(
        mime_type in allowed for allowed in SUPPORTED_MIME_TYPES.values()
    )
    if not is_supported:
        raise ValueError(
            f"MIME type is not supported: {mime_type} (for {source_name})"
        )
    return mime_type
class Document(
    KilnParentedModel, KilnParentModel, parent_of={"extractions": Extraction}
):
    """A document uploaded to a project for extraction and RAG."""

    name: FilenameString = Field(
        description="A name to identify the document. Should not be changed after creation.",
    )

    # Unlike `name`, this field can be changed after creation.
    name_override: str | None = Field(
        description="A friendly name to identify the document. This is used for display purposes and can be different from the name.",
        default=None,
    )

    description: str = Field(description="A description for the file")

    original_file: FileInfo = Field(description="The original file")

    kind: Kind = Field(
        description="The kind of document. The kind is a broad family of filetypes that can be handled in a similar way"
    )

    tags: List[str] = Field(
        default_factory=list,
        description="Tags for the document. Tags are used to categorize documents for filtering and reporting.",
    )

    @model_validator(mode="after")
    def validate_tags(self) -> Self:
        """Reject empty tags and tags that contain a space character."""
        # Tags are checked in order; the first offending tag decides the error.
        for candidate in self.tags:
            if candidate == "" or not candidate:
                raise ValueError("Tags cannot be empty strings")
            if candidate.find(" ") != -1:
                raise ValueError("Tags cannot contain spaces. Try underscores.")
        return self

    # Workaround to return typed parent without importing Project
    def parent_project(self) -> Union["Project", None]:
        """Return the parent if its class is named "Project", otherwise None."""
        if self.parent is not None and self.parent.__class__.__name__ == "Project":
            return self.parent  # type: ignore
        return None

    def extractions(self, readonly: bool = False) -> list[Extraction]:
        """Typed wrapper around the parent-class `extractions` accessor."""
        found = super().extractions(readonly=readonly)  # type: ignore
        return found

    @computed_field
    @property
    def friendly_name(self) -> str:
        """Display name: `name_override` when set and non-empty, else `name`."""
        # Backward compatibility: old documents did not have name_override.
        if self.name_override:
            return self.name_override
        return self.name
A document uploaded to a project for extraction and RAG.
def child_method(self, readonly: bool = False) -> list[child_class]:  # type: ignore[invalid-type-form]
    # Delegate to the child class, passing this instance's path and the readonly flag.
    children = child_class.all_children_of_parent_path(self.path, readonly=readonly)
    return children
The type of the None singleton.
The type of the None singleton.
Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
def init_private_attributes(self: BaseModel, context: Any, /) -> None:
    """Fill `__pydantic_private__` with private-attribute defaults if it is unset.

    Written to be usable like a BaseModel method. `context` is accepted
    because pydantic-core passes it when calling this function; it is not
    read here.

    Args:
        self: The BaseModel instance.
        context: The context.
    """
    # Leave an already-initialised private store untouched.
    if getattr(self, '__pydantic_private__', None) is not None:
        return

    defaults = {}
    for attr_name, attr in self.__private_attributes__.items():
        value = attr.get_default()
        # Attributes without a default are skipped rather than stored.
        if value is PydanticUndefined:
            continue
        defaults[attr_name] = value
    object_setattr(self, '__pydantic_private__', defaults)
This function is meant to behave like a BaseModel method to initialise private attributes.
It takes context as an argument since that's what pydantic-core passes when calling it.
Args: self: The BaseModel instance. context: The context.