kiln_ai.datamodel.chunk
import logging
from enum import Enum
from typing import TYPE_CHECKING, Annotated, List, Union

import anyio
from pydantic import (
    AfterValidator,
    BaseModel,
    Field,
    NonNegativeInt,
    PositiveInt,
    SerializationInfo,
    ValidationInfo,
    field_serializer,
    model_validator,
)
from typing_extensions import Literal, TypedDict

from kiln_ai.datamodel.basemodel import (
    ID_TYPE,
    FilenameString,
    KilnAttachmentModel,
    KilnParentedModel,
    KilnParentModel,
)
from kiln_ai.datamodel.embedding import ChunkEmbeddings

logger = logging.getLogger(__name__)

if TYPE_CHECKING:
    from kiln_ai.datamodel.extraction import Extraction
    from kiln_ai.datamodel.project import Project


class ChunkerType(str, Enum):
    """The type of chunking algorithm to use."""

    FIXED_WINDOW = "fixed_window"
    SEMANTIC = "semantic"


class SemanticChunkerProperties(TypedDict, total=True):
    # chunker_type doubles as the discriminator for the ChunkerConfig.properties union
    chunker_type: Literal[ChunkerType.SEMANTIC]
    embedding_config_id: str
    buffer_size: PositiveInt
    breakpoint_percentile_threshold: NonNegativeInt
    include_metadata: bool
    include_prev_next_rel: bool


class FixedWindowChunkerProperties(TypedDict, total=True):
    # chunker_type doubles as the discriminator for the ChunkerConfig.properties union
    chunker_type: Literal[ChunkerType.FIXED_WINDOW]
    chunk_overlap: NonNegativeInt
    chunk_size: PositiveInt


def validate_fixed_window_chunker_properties(
    properties: FixedWindowChunkerProperties,
) -> FixedWindowChunkerProperties:
    """Validate the properties for the fixed window chunker and set defaults if needed.

    Raises:
        ValueError: If chunk_overlap is not strictly less than chunk_size.
    """
    # the typed dict only validates the shape and types, but not the logic, so we validate here
    if properties["chunk_overlap"] >= properties["chunk_size"]:
        raise ValueError("Chunk overlap must be less than chunk size.")

    return properties


def validate_semantic_chunker_properties(
    properties: SemanticChunkerProperties,
) -> SemanticChunkerProperties:
    """Validate the properties for the semantic chunker.

    Raises:
        ValueError: If buffer_size is < 1, or breakpoint_percentile_threshold
            is outside [0, 100].
    """
    buffer_size = properties["buffer_size"]
    if buffer_size < 1:
        raise ValueError("buffer_size must be greater than or equal to 1.")

    breakpoint_percentile_threshold = properties["breakpoint_percentile_threshold"]
    if not (0 <= breakpoint_percentile_threshold <= 100):
        raise ValueError("breakpoint_percentile_threshold must be between 0 and 100.")

    return properties


# Annotated variants run the logic checks above after pydantic's shape/type validation.
SemanticChunkerPropertiesValidator = Annotated[
    SemanticChunkerProperties,
    AfterValidator(lambda v: validate_semantic_chunker_properties(v)),
]

FixedWindowChunkerPropertiesValidator = Annotated[
    FixedWindowChunkerProperties,
    AfterValidator(lambda v: validate_fixed_window_chunker_properties(v)),
]


class ChunkerConfig(KilnParentedModel):
    """Configuration for chunking extracted documents into smaller pieces."""

    name: FilenameString = Field(
        description="A name to identify the chunker config.",
    )
    description: str | None = Field(
        default=None, description="The description of the chunker config"
    )
    chunker_type: ChunkerType = Field(
        description="This is used to determine the type of chunker to use.",
    )
    properties: (
        SemanticChunkerPropertiesValidator | FixedWindowChunkerPropertiesValidator
    ) = Field(
        description="Properties to be used to execute the chunker config. This is chunker_type specific and should serialize to a json dict.",
        discriminator="chunker_type",
    )

    # Workaround to return typed parent without importing Project
    def parent_project(self) -> Union["Project", None]:
        if self.parent is None or self.parent.__class__.__name__ != "Project":
            return None
        return self.parent  # type: ignore

    @model_validator(mode="before")
    def upgrade_missing_discriminator_properties(
        cls, data: dict, info: ValidationInfo
    ) -> dict:
        """Inject chunker_type into properties when loading legacy files from disk."""
        if not info.context or not info.context.get("loading_from_file", False):
            # Not loading from file, so no need to upgrade
            return data

        if not isinstance(data, dict):
            return data

        # backward compatibility:
        # - we originally did not have the chunker_type in the properties
        # - we later moved chunker_type into the properties to use pydantic's
        #   discriminated union feature, so older files need it added here
        properties = data.get("properties", {})
        if "chunker_type" not in properties:
            # the chunker_type on the parent model is always there, we just need to add it to the properties
            properties["chunker_type"] = data["chunker_type"]
            data["properties"] = properties
        return data

    @model_validator(mode="after")
    def ensure_chunker_type_matches_properties(self):
        # sanity check to ensure the chunker_type matches the properties chunker_type
        if self.chunker_type != self.properties["chunker_type"]:
            raise ValueError(
                f"Chunker type mismatch: {self.chunker_type} != {self.properties['chunker_type']}. This is a bug, please report it."
            )
        return self

    # expose the typed properties based on the chunker_type
    @property
    def semantic_properties(self) -> SemanticChunkerProperties:
        if self.properties["chunker_type"] != ChunkerType.SEMANTIC:
            raise ValueError(
                "Semantic properties are only available for semantic chunker."
            )
        # TypedDict cannot be checked at runtime, so we need to ignore the type check
        # or cast (but it is currently banned in our linting rules). Better solution
        # would be discriminated union, but that requires the discriminator to be part
        # of the properties (not outside on the parent model).
        return self.properties  # type: ignore[return-value]

    @property
    def fixed_window_properties(self) -> FixedWindowChunkerProperties:
        if self.properties["chunker_type"] != ChunkerType.FIXED_WINDOW:
            raise ValueError(
                "Fixed window properties are only available for fixed window chunker."
            )
        # TypedDict cannot be checked at runtime, so we need to ignore the type check
        # or cast (but it is currently banned in our linting rules). Better solution
        # would be discriminated union, but that requires the discriminator to be part
        # of the properties (not outside on the parent model).
        return self.properties  # type: ignore[return-value]


class Chunk(BaseModel):
    """A single chunk of a document, stored as a file attachment."""

    content: KilnAttachmentModel = Field(
        description="The content of the chunk, stored as an attachment."
    )

    @field_serializer("content")
    def serialize_content(
        self, content: KilnAttachmentModel, info: SerializationInfo
    ) -> dict:
        """Serialize the attachment with a fixed filename prefix for chunk content."""
        # Copy the context rather than mutating it in place, so the injected
        # "filename_prefix" key does not leak back into the caller's context
        # dict (the same context object may be shared across serializers).
        context = {**(info.context or {}), "filename_prefix": "content"}
        return content.model_dump(mode="json", context=context)


class ChunkedDocument(
    KilnParentedModel, KilnParentModel, parent_of={"chunk_embeddings": ChunkEmbeddings}
):
    """A document that has been chunked, storing the resulting chunks."""

    chunker_config_id: ID_TYPE = Field(
        description="The ID of the chunker config used to chunk the document.",
    )
    chunks: List[Chunk] = Field(description="The chunks of the document.")

    # Workaround to return typed parent without importing Extraction
    def parent_extraction(self) -> Union["Extraction", None]:
        if self.parent is None or self.parent.__class__.__name__ != "Extraction":
            return None
        return self.parent  # type: ignore

    def chunk_embeddings(self, readonly: bool = False) -> list[ChunkEmbeddings]:
        # Typed wrapper over the parent_of-generated accessor.
        return super().chunk_embeddings(readonly=readonly)  # type: ignore

    async def load_chunks_text(self) -> list[str]:
        """Utility to return a list of text for each chunk, loaded from each chunk's content attachment.

        Raises:
            ValueError: If this document has no path on disk, or a chunk's
                content attachment cannot be read.
        """
        if not self.path:
            raise ValueError(
                "Failed to resolve the path of chunk content attachment because the chunk does not have a path."
            )

        chunks_text: list[str] = []
        for chunk in self.chunks:
            full_path = chunk.content.resolve_path(self.path.parent)

            try:
                chunks_text.append(
                    await anyio.Path(full_path).read_text(encoding="utf-8")
                )
            except Exception as e:
                raise ValueError(
                    f"Failed to read chunk content for {full_path}: {e}"
                ) from e

        return chunks_text
36class ChunkerType(str, Enum): 37 """The type of chunking algorithm to use.""" 38 39 FIXED_WINDOW = "fixed_window" 40 SEMANTIC = "semantic"
The type of chunking algorithm to use.
43class SemanticChunkerProperties(TypedDict, total=True): 44 chunker_type: Literal[ChunkerType.SEMANTIC] 45 embedding_config_id: str 46 buffer_size: PositiveInt 47 breakpoint_percentile_threshold: NonNegativeInt 48 include_metadata: bool 49 include_prev_next_rel: bool
52class FixedWindowChunkerProperties(TypedDict, total=True): 53 chunker_type: Literal[ChunkerType.FIXED_WINDOW] 54 chunk_overlap: NonNegativeInt 55 chunk_size: PositiveInt
58def validate_fixed_window_chunker_properties( 59 properties: FixedWindowChunkerProperties, 60) -> FixedWindowChunkerProperties: 61 """Validate the properties for the fixed window chunker and set defaults if needed.""" 62 # the typed dict only validates the shape and types, but not the logic, so we validate here 63 if properties["chunk_overlap"] >= properties["chunk_size"]: 64 raise ValueError("Chunk overlap must be less than chunk size.") 65 66 return properties
Validate the properties for the fixed window chunker and set defaults if needed.
69def validate_semantic_chunker_properties( 70 properties: SemanticChunkerProperties, 71) -> SemanticChunkerProperties: 72 """Validate the properties for the semantic chunker.""" 73 buffer_size = properties["buffer_size"] 74 if buffer_size < 1: 75 raise ValueError("buffer_size must be greater than or equal to 1.") 76 77 breakpoint_percentile_threshold = properties["breakpoint_percentile_threshold"] 78 if not (0 <= breakpoint_percentile_threshold <= 100): 79 raise ValueError("breakpoint_percentile_threshold must be between 0 and 100.") 80 81 return properties
Validate the properties for the semantic chunker.
95class ChunkerConfig(KilnParentedModel): 96 """Configuration for chunking extracted documents into smaller pieces.""" 97 98 name: FilenameString = Field( 99 description="A name to identify the chunker config.", 100 ) 101 description: str | None = Field( 102 default=None, description="The description of the chunker config" 103 ) 104 chunker_type: ChunkerType = Field( 105 description="This is used to determine the type of chunker to use.", 106 ) 107 properties: ( 108 SemanticChunkerPropertiesValidator | FixedWindowChunkerPropertiesValidator 109 ) = Field( 110 description="Properties to be used to execute the chunker config. This is chunker_type specific and should serialize to a json dict.", 111 discriminator="chunker_type", 112 ) 113 114 # Workaround to return typed parent without importing Project 115 def parent_project(self) -> Union["Project", None]: 116 if self.parent is None or self.parent.__class__.__name__ != "Project": 117 return None 118 return self.parent # type: ignore 119 120 @model_validator(mode="before") 121 def upgrade_missing_discriminator_properties( 122 cls, data: dict, info: ValidationInfo 123 ) -> dict: 124 if not info.context or not info.context.get("loading_from_file", False): 125 # Not loading from file, so no need to upgrade 126 return data 127 128 if not isinstance(data, dict): 129 return data 130 131 # backward compatibility: 132 # - we originally did not have the chunker_type in the properties, so we need to add it here 133 # - we started wanted to have chunker_type in the properties to use pydantic's discriminated union feature 134 properties = data.get("properties", {}) 135 if "chunker_type" not in properties: 136 # the chunker_type on the parent model is always there, we just need to add it to the properties 137 properties["chunker_type"] = data["chunker_type"] 138 data["properties"] = properties 139 return data 140 141 @model_validator(mode="after") 142 def ensure_chunker_type_matches_properties(self): 143 # sanity check to ensure 
the chunker_type matches the properties chunker_type 144 if self.chunker_type != self.properties["chunker_type"]: 145 raise ValueError( 146 f"Chunker type mismatch: {self.chunker_type} != {self.properties['chunker_type']}. This is a bug, please report it." 147 ) 148 return self 149 150 # expose the typed properties based on the chunker_type 151 @property 152 def semantic_properties(self) -> SemanticChunkerProperties: 153 if self.properties["chunker_type"] != ChunkerType.SEMANTIC: 154 raise ValueError( 155 "Semantic properties are only available for semantic chunker." 156 ) 157 # TypedDict cannot be checked at runtime, so we need to ignore the type check 158 # or cast (but it is currently banned in our linting rules). Better solution 159 # would be discriminated union, but that requires the discriminator to be part 160 # of the properties (not outside on the parent model). 161 return self.properties # type: ignore[return-value] 162 163 @property 164 def fixed_window_properties(self) -> FixedWindowChunkerProperties: 165 if self.properties["chunker_type"] != ChunkerType.FIXED_WINDOW: 166 raise ValueError( 167 "Fixed window properties are only available for fixed window chunker." 168 ) 169 # TypedDict cannot be checked at runtime, so we need to ignore the type check 170 # or cast (but it is currently banned in our linting rules). Better solution 171 # would be discriminated union, but that requires the discriminator to be part 172 # of the properties (not outside on the parent model). 173 return self.properties # type: ignore[return-value]
Configuration for chunking extracted documents into smaller pieces.
120 @model_validator(mode="before") 121 def upgrade_missing_discriminator_properties( 122 cls, data: dict, info: ValidationInfo 123 ) -> dict: 124 if not info.context or not info.context.get("loading_from_file", False): 125 # Not loading from file, so no need to upgrade 126 return data 127 128 if not isinstance(data, dict): 129 return data 130 131 # backward compatibility: 132 # - we originally did not have the chunker_type in the properties, so we need to add it here 133 # - we started wanted to have chunker_type in the properties to use pydantic's discriminated union feature 134 properties = data.get("properties", {}) 135 if "chunker_type" not in properties: 136 # the chunker_type on the parent model is always there, we just need to add it to the properties 137 properties["chunker_type"] = data["chunker_type"] 138 data["properties"] = properties 139 return data
141 @model_validator(mode="after") 142 def ensure_chunker_type_matches_properties(self): 143 # sanity check to ensure the chunker_type matches the properties chunker_type 144 if self.chunker_type != self.properties["chunker_type"]: 145 raise ValueError( 146 f"Chunker type mismatch: {self.chunker_type} != {self.properties['chunker_type']}. This is a bug, please report it." 147 ) 148 return self
151 @property 152 def semantic_properties(self) -> SemanticChunkerProperties: 153 if self.properties["chunker_type"] != ChunkerType.SEMANTIC: 154 raise ValueError( 155 "Semantic properties are only available for semantic chunker." 156 ) 157 # TypedDict cannot be checked at runtime, so we need to ignore the type check 158 # or cast (but it is currently banned in our linting rules). Better solution 159 # would be discriminated union, but that requires the discriminator to be part 160 # of the properties (not outside on the parent model). 161 return self.properties # type: ignore[return-value]
163 @property 164 def fixed_window_properties(self) -> FixedWindowChunkerProperties: 165 if self.properties["chunker_type"] != ChunkerType.FIXED_WINDOW: 166 raise ValueError( 167 "Fixed window properties are only available for fixed window chunker." 168 ) 169 # TypedDict cannot be checked at runtime, so we need to ignore the type check 170 # or cast (but it is currently banned in our linting rules). Better solution 171 # would be discriminated union, but that requires the discriminator to be part 172 # of the properties (not outside on the parent model). 173 return self.properties # type: ignore[return-value]
The type of the None singleton.
Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
337def init_private_attributes(self: BaseModel, context: Any, /) -> None: 338 """This function is meant to behave like a BaseModel method to initialise private attributes. 339 340 It takes context as an argument since that's what pydantic-core passes when calling it. 341 342 Args: 343 self: The BaseModel instance. 344 context: The context. 345 """ 346 if getattr(self, '__pydantic_private__', None) is None: 347 pydantic_private = {} 348 for name, private_attr in self.__private_attributes__.items(): 349 default = private_attr.get_default() 350 if default is not PydanticUndefined: 351 pydantic_private[name] = default 352 object_setattr(self, '__pydantic_private__', pydantic_private)
This function is meant to behave like a BaseModel method to initialise private attributes.
It takes context as an argument since that's what pydantic-core passes when calling it.
Args: self: The BaseModel instance. context: The context.
176class Chunk(BaseModel): 177 """A single chunk of a document, stored as a file attachment.""" 178 179 content: KilnAttachmentModel = Field( 180 description="The content of the chunk, stored as an attachment." 181 ) 182 183 @field_serializer("content") 184 def serialize_content( 185 self, content: KilnAttachmentModel, info: SerializationInfo 186 ) -> dict: 187 context = info.context or {} 188 context["filename_prefix"] = "content" 189 return content.model_dump(mode="json", context=context)
A single chunk of a document, stored as a file attachment.
192class ChunkedDocument( 193 KilnParentedModel, KilnParentModel, parent_of={"chunk_embeddings": ChunkEmbeddings} 194): 195 """A document that has been chunked, storing the resulting chunks.""" 196 197 chunker_config_id: ID_TYPE = Field( 198 description="The ID of the chunker config used to chunk the document.", 199 ) 200 chunks: List[Chunk] = Field(description="The chunks of the document.") 201 202 def parent_extraction(self) -> Union["Extraction", None]: 203 if self.parent is None or self.parent.__class__.__name__ != "Extraction": 204 return None 205 return self.parent # type: ignore 206 207 def chunk_embeddings(self, readonly: bool = False) -> list[ChunkEmbeddings]: 208 return super().chunk_embeddings(readonly=readonly) # type: ignore 209 210 async def load_chunks_text(self) -> list[str]: 211 """Utility to return a list of text for each chunk, loaded from each chunk's content attachment.""" 212 if not self.path: 213 raise ValueError( 214 "Failed to resolve the path of chunk content attachment because the chunk does not have a path." 215 ) 216 217 chunks_text: list[str] = [] 218 for chunk in self.chunks: 219 full_path = chunk.content.resolve_path(self.path.parent) 220 221 try: 222 chunks_text.append( 223 await anyio.Path(full_path).read_text(encoding="utf-8") 224 ) 225 except Exception as e: 226 raise ValueError( 227 f"Failed to read chunk content for {full_path}: {e}" 228 ) from e 229 230 return chunks_text
A document that has been chunked, storing the resulting chunks.
743 def child_method(self, readonly: bool = False) -> list[child_class]: # type: ignore[invalid-type-form] 744 return child_class.all_children_of_parent_path(self.path, readonly=readonly)
The type of the None singleton.
210 async def load_chunks_text(self) -> list[str]: 211 """Utility to return a list of text for each chunk, loaded from each chunk's content attachment.""" 212 if not self.path: 213 raise ValueError( 214 "Failed to resolve the path of chunk content attachment because the chunk does not have a path." 215 ) 216 217 chunks_text: list[str] = [] 218 for chunk in self.chunks: 219 full_path = chunk.content.resolve_path(self.path.parent) 220 221 try: 222 chunks_text.append( 223 await anyio.Path(full_path).read_text(encoding="utf-8") 224 ) 225 except Exception as e: 226 raise ValueError( 227 f"Failed to read chunk content for {full_path}: {e}" 228 ) from e 229 230 return chunks_text
Utility to return a list of text for each chunk, loaded from each chunk's content attachment.
The type of the None singleton.
Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
337def init_private_attributes(self: BaseModel, context: Any, /) -> None: 338 """This function is meant to behave like a BaseModel method to initialise private attributes. 339 340 It takes context as an argument since that's what pydantic-core passes when calling it. 341 342 Args: 343 self: The BaseModel instance. 344 context: The context. 345 """ 346 if getattr(self, '__pydantic_private__', None) is None: 347 pydantic_private = {} 348 for name, private_attr in self.__private_attributes__.items(): 349 default = private_attr.get_default() 350 if default is not PydanticUndefined: 351 pydantic_private[name] = default 352 object_setattr(self, '__pydantic_private__', pydantic_private)
This function is meant to behave like a BaseModel method to initialise private attributes.
It takes context as an argument since that's what pydantic-core passes when calling it.
Args: self: The BaseModel instance. context: The context.