kiln_ai.datamodel.chunk
import logging
from enum import Enum
from typing import TYPE_CHECKING, List, Union

import anyio
from pydantic import (
    BaseModel,
    Field,
    SerializationInfo,
    ValidationInfo,
    field_serializer,
    field_validator,
)

from kiln_ai.datamodel.basemodel import (
    ID_TYPE,
    FilenameString,
    KilnAttachmentModel,
    KilnParentedModel,
    KilnParentModel,
)
from kiln_ai.datamodel.embedding import ChunkEmbeddings

logger = logging.getLogger(__name__)

if TYPE_CHECKING:
    from kiln_ai.datamodel.extraction import Extraction
    from kiln_ai.datamodel.project import Project


def _is_strict_int(value: object) -> bool:
    """Return True for real integers only; bool is rejected even though it subclasses int."""
    return isinstance(value, int) and not isinstance(value, bool)


def validate_fixed_window_chunker_properties(
    properties: dict[str, str | int | float | bool],
) -> dict[str, str | int | float | bool]:
    """Validate the properties for the fixed window chunker.

    No defaults are filled in: both ``chunk_overlap`` and ``chunk_size`` must
    be present.

    Args:
        properties: The chunker properties to check.

    Returns:
        The same ``properties`` dict, unmodified, when every check passes.

    Raises:
        ValueError: If ``chunk_overlap`` or ``chunk_size`` is missing, is not
            an integer (booleans are rejected), is out of range, or if the
            overlap is not strictly smaller than the size.
    """
    chunk_overlap = properties.get("chunk_overlap")
    if chunk_overlap is None:
        raise ValueError("Chunk overlap is required.")

    chunk_size = properties.get("chunk_size")
    if chunk_size is None:
        raise ValueError("Chunk size is required.")

    # bool is a subclass of int, so a plain isinstance check would let
    # True/False through as 1/0.
    if not _is_strict_int(chunk_overlap):
        raise ValueError("Chunk overlap must be an integer.")
    if chunk_overlap < 0:
        raise ValueError("Chunk overlap must be greater than or equal to 0.")

    if not _is_strict_int(chunk_size):
        raise ValueError("Chunk size must be an integer.")
    if chunk_size <= 0:
        raise ValueError("Chunk size must be greater than 0.")

    if chunk_overlap >= chunk_size:
        raise ValueError("Chunk overlap must be less than chunk size.")

    return properties


class ChunkerType(str, Enum):
    """The kinds of chunker a ChunkerConfig can select."""

    FIXED_WINDOW = "fixed_window"


class ChunkerConfig(KilnParentedModel):
    """A named chunker configuration: a chunker type plus its type-specific properties."""

    name: FilenameString = Field(
        description="A name to identify the chunker config.",
    )
    description: str | None = Field(
        default=None, description="The description of the chunker config"
    )
    chunker_type: ChunkerType = Field(
        description="This is used to determine the type of chunker to use.",
    )
    properties: dict[str, str | int | float | bool] = Field(
        description="Properties to be used to execute the chunker config. This is chunker_type specific and should serialize to a json dict.",
    )

    # Workaround to return typed parent without importing Project
    def parent_project(self) -> Union["Project", None]:
        """Return the parent when its class is named ``Project``, otherwise None."""
        if self.parent is None or self.parent.__class__.__name__ != "Project":
            return None
        return self.parent  # type: ignore

    @field_validator("properties")
    @classmethod
    def validate_properties(
        cls, properties: dict[str, str | int | float | bool], info: ValidationInfo
    ) -> dict[str, str | int | float | bool]:
        """Run the chunker-type-specific validation of ``properties``.

        Only the fixed window chunker has checks; for any other value of
        ``chunker_type`` in ``info.data`` the properties are returned as-is.
        """
        if info.data.get("chunker_type") == ChunkerType.FIXED_WINDOW:
            # do not trigger revalidation of properties
            return validate_fixed_window_chunker_properties(properties)
        return properties

    def chunk_size(self) -> int | None:
        """Return ``properties["chunk_size"]``, or None when it is absent.

        Raises:
            ValueError: If the stored value is not an integer.
        """
        if self.properties.get("chunk_size") is None:
            return None
        if not isinstance(self.properties["chunk_size"], int):
            raise ValueError("Chunk size must be an integer.")
        return self.properties["chunk_size"]

    def chunk_overlap(self) -> int | None:
        """Return ``properties["chunk_overlap"]``, or None when it is absent.

        Raises:
            ValueError: If the stored value is not an integer.
        """
        if self.properties.get("chunk_overlap") is None:
            return None
        if not isinstance(self.properties["chunk_overlap"], int):
            raise ValueError("Chunk overlap must be an integer.")
        return self.properties["chunk_overlap"]


class Chunk(BaseModel):
    """A single chunk of a document; its content is held as an attachment."""

    content: KilnAttachmentModel = Field(
        description="The content of the chunk, stored as an attachment."
    )

    @field_serializer("content")
    def serialize_content(
        self, content: KilnAttachmentModel, info: SerializationInfo
    ) -> dict:
        """Dump ``content`` in JSON mode with ``filename_prefix`` set to "content".

        The incoming serialization context is copied first, so the caller's
        context dict is not mutated by adding the key.
        """
        context = dict(info.context or {})
        context["filename_prefix"] = "content"
        return content.model_dump(mode="json", context=context)


class ChunkedDocument(
    KilnParentedModel, KilnParentModel, parent_of={"chunk_embeddings": ChunkEmbeddings}
):
    """The chunks produced for a document by one chunker config."""

    chunker_config_id: ID_TYPE = Field(
        description="The ID of the chunker config used to chunk the document.",
    )
    chunks: List[Chunk] = Field(description="The chunks of the document.")

    def parent_extraction(self) -> Union["Extraction", None]:
        """Return the parent when its class is named ``Extraction``, otherwise None."""
        if self.parent is None or self.parent.__class__.__name__ != "Extraction":
            return None
        return self.parent  # type: ignore

    def chunk_embeddings(self, readonly: bool = False) -> list[ChunkEmbeddings]:
        """Typed wrapper that forwards to the inherited ``chunk_embeddings`` accessor."""
        return super().chunk_embeddings(readonly=readonly)  # type: ignore

    async def load_chunks_text(self) -> list[str]:
        """Utility to return a list of text for each chunk, loaded from each chunk's content attachment.

        Raises:
            ValueError: If this model has no path, or if reading any chunk's
                content file fails (the original error is chained).
        """
        if not self.path:
            raise ValueError(
                "Failed to resolve the path of chunk content attachment because the chunk does not have a path."
            )

        chunks_text: list[str] = []
        for chunk in self.chunks:
            # Attachment paths are resolved against the directory holding this model's file.
            full_path = chunk.content.resolve_path(self.path.parent)

            try:
                chunks_text.append(
                    await anyio.Path(full_path).read_text(encoding="utf-8")
                )
            except Exception as e:
                raise ValueError(
                    f"Failed to read chunk content for {full_path}: {e}"
                ) from e

        return chunks_text
32def validate_fixed_window_chunker_properties( 33 properties: dict[str, str | int | float | bool], 34) -> dict[str, str | int | float | bool]: 35 """Validate the properties for the fixed window chunker and set defaults if needed.""" 36 chunk_overlap = properties.get("chunk_overlap") 37 if chunk_overlap is None: 38 raise ValueError("Chunk overlap is required.") 39 40 chunk_size = properties.get("chunk_size") 41 if chunk_size is None: 42 raise ValueError("Chunk size is required.") 43 44 if not isinstance(chunk_overlap, int): 45 raise ValueError("Chunk overlap must be an integer.") 46 if chunk_overlap < 0: 47 raise ValueError("Chunk overlap must be greater than or equal to 0.") 48 49 if not isinstance(chunk_size, int): 50 raise ValueError("Chunk size must be an integer.") 51 if chunk_size <= 0: 52 raise ValueError("Chunk size must be greater than 0.") 53 54 if chunk_overlap >= chunk_size: 55 raise ValueError("Chunk overlap must be less than chunk size.") 56 57 return properties
Validate the properties for the fixed window chunker, raising a ValueError if a required property is missing or invalid.
str(object='') -> str str(bytes_or_buffer[, encoding[, errors]]) -> str
Create a new string object from the given object. If encoding or errors is specified, then the object must expose a data buffer that will be decoded using the given encoding and error handler. Otherwise, returns the result of object.__str__() (if defined) or repr(object). encoding defaults to 'utf-8'. errors defaults to 'strict'.
class ChunkerConfig(KilnParentedModel):
    """A named chunker configuration: a chunker type plus its type-specific properties."""

    name: FilenameString = Field(
        description="A name to identify the chunker config.",
    )
    description: str | None = Field(
        default=None, description="The description of the chunker config"
    )
    chunker_type: ChunkerType = Field(
        description="This is used to determine the type of chunker to use.",
    )
    properties: dict[str, str | int | float | bool] = Field(
        description="Properties to be used to execute the chunker config. This is chunker_type specific and should serialize to a json dict.",
    )

    # Workaround to return typed parent without importing Project
    def parent_project(self) -> Union["Project", None]:
        """Return the parent when its class is named ``Project``, otherwise None."""
        candidate = self.parent
        if candidate is not None and candidate.__class__.__name__ == "Project":
            return candidate  # type: ignore
        return None

    @field_validator("properties")
    @classmethod
    def validate_properties(
        cls, properties: dict[str, str | int | float | bool], info: ValidationInfo
    ) -> dict[str, str | int | float | bool]:
        """Apply the chunker-type-specific checks to ``properties``.

        Only the fixed window chunker has checks; anything else passes through.
        """
        selected_type = info.data.get("chunker_type")
        if selected_type != ChunkerType.FIXED_WINDOW:
            return properties
        # Hand back the validated dict directly so properties are not revalidated.
        return validate_fixed_window_chunker_properties(properties)

    def chunk_size(self) -> int | None:
        """Return the ``chunk_size`` property, or None when it is not set.

        Raises:
            ValueError: If the stored value is not an integer.
        """
        size = self.properties.get("chunk_size")
        if size is None:
            return None
        if isinstance(size, int):
            return size
        raise ValueError("Chunk size must be an integer.")

    def chunk_overlap(self) -> int | None:
        """Return the ``chunk_overlap`` property, or None when it is not set.

        Raises:
            ValueError: If the stored value is not an integer.
        """
        overlap = self.properties.get("chunk_overlap")
        if overlap is None:
            return None
        if isinstance(overlap, int):
            return overlap
        raise ValueError("Chunk overlap must be an integer.")
Base model for Kiln models that have a parent-child relationship. This base class is for child models.
This class provides functionality for managing hierarchical relationships between models, including parent reference handling and file system organization.
Attributes: parent (KilnBaseModel): Reference to the parent model instance. Not persisted, just in memory.
@field_validator("properties")
@classmethod
def validate_properties(
    cls, properties: dict[str, str | int | float | bool], info: ValidationInfo
) -> dict[str, str | int | float | bool]:
    """Apply the chunker-type-specific checks to ``properties``.

    Only the fixed window chunker has checks; for any other ``chunker_type``
    found in ``info.data`` the properties are returned unchanged.
    """
    selected_type = info.data.get("chunker_type")
    if selected_type != ChunkerType.FIXED_WINDOW:
        return properties
    # Hand back the validated dict directly so properties are not revalidated.
    return validate_fixed_window_chunker_properties(properties)
The type of the None singleton.
Configuration for the model, should be a dictionary conforming to [`ConfigDict`][pydantic.config.ConfigDict].
def init_private_attributes(self: BaseModel, context: Any, /) -> None:
    """Populate ``__pydantic_private__`` with private-attribute defaults, BaseModel-method style.

    ``context`` is accepted only because pydantic-core passes it when calling
    this function; it is not used.

    Args:
        self: The BaseModel instance.
        context: The context.
    """
    # Nothing to do when the private-attribute storage already exists.
    if getattr(self, '__pydantic_private__', None) is not None:
        return

    # Keep only the attributes that actually define a default.
    defaults = {
        attr_name: value
        for attr_name, attr in self.__private_attributes__.items()
        if (value := attr.get_default()) is not PydanticUndefined
    }
    object_setattr(self, '__pydantic_private__', defaults)
This function is meant to behave like a BaseModel method to initialise private attributes.
It takes context as an argument since that's what pydantic-core passes when calling it.
Args: self: The BaseModel instance. context: The context.
class Chunk(BaseModel):
    """A single chunk of a document; its content is held as an attachment."""

    content: KilnAttachmentModel = Field(
        description="The content of the chunk, stored as an attachment."
    )

    @field_serializer("content")
    def serialize_content(
        self, content: KilnAttachmentModel, info: SerializationInfo
    ) -> dict:
        """Dump ``content`` in JSON mode with ``filename_prefix`` set to "content".

        The incoming serialization context is copied before the key is added.
        Writing into ``info.context`` directly would mutate the dict the caller
        passed to ``model_dump`` and leave ``filename_prefix`` set in it after
        this field has been serialized.
        """
        context = dict(info.context or {})
        context["filename_prefix"] = "content"
        return content.model_dump(mode="json", context=context)
!!! abstract "Usage Documentation" Models
A base class for creating Pydantic models.
Attributes:
__class_vars__: The names of the class variables defined on the model.
__private_attributes__: Metadata about the private attributes of the model.
__signature__: The synthesized `__init__` [`Signature`][inspect.Signature] of the model.
__pydantic_complete__: Whether model building is completed, or if there are still undefined fields.
__pydantic_core_schema__: The core schema of the model.
__pydantic_custom_init__: Whether the model has a custom `__init__` function.
__pydantic_decorators__: Metadata containing the decorators defined on the model.
This replaces `Model.__validators__` and `Model.__root_validators__` from Pydantic V1.
__pydantic_generic_metadata__: Metadata for generic models; contains data used for a similar purpose to
__args__, __origin__, __parameters__ in typing-module generics. May eventually be replaced by these.
__pydantic_parent_namespace__: Parent namespace of the model, used for automatic rebuilding of models.
__pydantic_post_init__: The name of the post-init method for the model, if defined.
__pydantic_root_model__: Whether the model is a [`RootModel`][pydantic.root_model.RootModel].
__pydantic_serializer__: The `pydantic-core` `SchemaSerializer` used to dump instances of the model.
__pydantic_validator__: The `pydantic-core` `SchemaValidator` used to validate instances of the model.
__pydantic_fields__: A dictionary of field names and their corresponding [`FieldInfo`][pydantic.fields.FieldInfo] objects.
__pydantic_computed_fields__: A dictionary of computed field names and their corresponding [`ComputedFieldInfo`][pydantic.fields.ComputedFieldInfo] objects.
__pydantic_extra__: A dictionary containing extra values, if [`extra`][pydantic.config.ConfigDict.extra]
is set to `'allow'`.
__pydantic_fields_set__: The names of fields explicitly set during instantiation.
__pydantic_private__: Values of private attributes set on the model instance.
class ChunkedDocument(
    KilnParentedModel, KilnParentModel, parent_of={"chunk_embeddings": ChunkEmbeddings}
):
    """The chunks produced for a document by one chunker config."""

    chunker_config_id: ID_TYPE = Field(
        description="The ID of the chunker config used to chunk the document.",
    )
    chunks: List[Chunk] = Field(description="The chunks of the document.")

    def parent_extraction(self) -> Union["Extraction", None]:
        """Return the parent when its class is named ``Extraction``, otherwise None."""
        candidate = self.parent
        if candidate is not None and candidate.__class__.__name__ == "Extraction":
            return candidate  # type: ignore
        return None

    def chunk_embeddings(self, readonly: bool = False) -> list[ChunkEmbeddings]:
        """Typed wrapper that forwards to the inherited ``chunk_embeddings`` accessor."""
        embeddings = super().chunk_embeddings(readonly=readonly)  # type: ignore
        return embeddings

    async def load_chunks_text(self) -> list[str]:
        """Read every chunk's content attachment and return the texts in chunk order.

        Raises:
            ValueError: If this model has no path, or if reading any chunk's
                content file fails (the original error is chained).
        """
        if not self.path:
            raise ValueError(
                "Failed to resolve the path of chunk content attachment because the chunk does not have a path."
            )

        # Attachment paths are resolved against the directory holding this model's file.
        base_dir = self.path.parent
        texts: list[str] = []
        for item in self.chunks:
            full_path = item.content.resolve_path(base_dir)
            try:
                text = await anyio.Path(full_path).read_text(encoding="utf-8")
            except Exception as e:
                raise ValueError(
                    f"Failed to read chunk content for {full_path}: {e}"
                ) from e
            texts.append(text)

        return texts
Base model for Kiln models that have a parent-child relationship. This base class is for child models.
This class provides functionality for managing hierarchical relationships between models, including parent reference handling and file system organization.
Attributes: parent (KilnBaseModel): Reference to the parent model instance. Not persisted, just in memory.
def child_method(self, readonly: bool = False) -> list[child_class]:
    """Return ``child_class.all_children_of_parent_path`` for this model's path."""
    parent_path = self.path
    return child_class.all_children_of_parent_path(parent_path, readonly=readonly)
The type of the None singleton.
async def load_chunks_text(self) -> list[str]:
    """Read every chunk's content attachment and return the texts in chunk order.

    Raises:
        ValueError: If this model has no path, or if reading any chunk's
            content file fails (the original error is chained).
    """
    if not self.path:
        raise ValueError(
            "Failed to resolve the path of chunk content attachment because the chunk does not have a path."
        )

    # Attachment paths are resolved against the directory holding this model's file.
    base_dir = self.path.parent
    texts: list[str] = []
    for item in self.chunks:
        full_path = item.content.resolve_path(base_dir)
        try:
            text = await anyio.Path(full_path).read_text(encoding="utf-8")
        except Exception as e:
            raise ValueError(
                f"Failed to read chunk content for {full_path}: {e}"
            ) from e
        texts.append(text)

    return texts
Utility to return a list of text for each chunk, loaded from each chunk's content attachment.
The type of the None singleton.
Configuration for the model, should be a dictionary conforming to [`ConfigDict`][pydantic.config.ConfigDict].
def init_private_attributes(self: BaseModel, context: Any, /) -> None:
    """Populate ``__pydantic_private__`` with private-attribute defaults, BaseModel-method style.

    ``context`` is accepted only because pydantic-core passes it when calling
    this function; it is not used.

    Args:
        self: The BaseModel instance.
        context: The context.
    """
    # Nothing to do when the private-attribute storage already exists.
    if getattr(self, '__pydantic_private__', None) is not None:
        return

    # Keep only the attributes that actually define a default.
    defaults = {
        attr_name: value
        for attr_name, attr in self.__private_attributes__.items()
        if (value := attr.get_default()) is not PydanticUndefined
    }
    object_setattr(self, '__pydantic_private__', defaults)
This function is meant to behave like a BaseModel method to initialise private attributes.
It takes context as an argument since that's what pydantic-core passes when calling it.
Args: self: The BaseModel instance. context: The context.