kiln_ai.datamodel.chunk
import logging
from enum import Enum
from typing import TYPE_CHECKING, Annotated, List, Union

import anyio
from pydantic import (
    AfterValidator,
    BaseModel,
    Field,
    NonNegativeInt,
    PositiveInt,
    SerializationInfo,
    ValidationInfo,
    field_serializer,
    model_validator,
)
from typing_extensions import Literal, TypedDict

from kiln_ai.datamodel.basemodel import (
    ID_TYPE,
    FilenameString,
    KilnAttachmentModel,
    KilnParentedModel,
    KilnParentModel,
)
from kiln_ai.datamodel.embedding import ChunkEmbeddings

logger = logging.getLogger(__name__)

if TYPE_CHECKING:
    from kiln_ai.datamodel.extraction import Extraction
    from kiln_ai.datamodel.project import Project


class ChunkerType(str, Enum):
    FIXED_WINDOW = "fixed_window"
    SEMANTIC = "semantic"


class SemanticChunkerProperties(TypedDict, total=True):
    chunker_type: Literal[ChunkerType.SEMANTIC]
    embedding_config_id: str
    buffer_size: PositiveInt
    breakpoint_percentile_threshold: NonNegativeInt
    include_metadata: bool
    include_prev_next_rel: bool


class FixedWindowChunkerProperties(TypedDict, total=True):
    chunker_type: Literal[ChunkerType.FIXED_WINDOW]
    chunk_overlap: NonNegativeInt
    chunk_size: PositiveInt


def validate_fixed_window_chunker_properties(
    properties: FixedWindowChunkerProperties,
) -> FixedWindowChunkerProperties:
    """Validate the properties for the fixed window chunker and set defaults if needed."""
    # the typed dict only validates the shape and types, but not the logic, so we validate here
    if properties["chunk_overlap"] >= properties["chunk_size"]:
        raise ValueError("Chunk overlap must be less than chunk size.")

    return properties


def validate_semantic_chunker_properties(
    properties: SemanticChunkerProperties,
) -> SemanticChunkerProperties:
    """Validate the properties for the semantic chunker."""
    buffer_size = properties["buffer_size"]
    if buffer_size < 1:
        raise ValueError("buffer_size must be greater than or equal to 1.")

    breakpoint_percentile_threshold = properties["breakpoint_percentile_threshold"]
    if not (0 <= breakpoint_percentile_threshold <= 100):
        raise ValueError("breakpoint_percentile_threshold must be between 0 and 100.")

    return properties


SemanticChunkerPropertiesValidator = Annotated[
    SemanticChunkerProperties,
    AfterValidator(lambda v: validate_semantic_chunker_properties(v)),
]

FixedWindowChunkerPropertiesValidator = Annotated[
    FixedWindowChunkerProperties,
    AfterValidator(lambda v: validate_fixed_window_chunker_properties(v)),
]


class ChunkerConfig(KilnParentedModel):
    name: FilenameString = Field(
        description="A name to identify the chunker config.",
    )
    description: str | None = Field(
        default=None, description="The description of the chunker config"
    )
    chunker_type: ChunkerType = Field(
        description="This is used to determine the type of chunker to use.",
    )
    properties: (
        SemanticChunkerPropertiesValidator | FixedWindowChunkerPropertiesValidator
    ) = Field(
        description="Properties to be used to execute the chunker config. This is chunker_type specific and should serialize to a json dict.",
        discriminator="chunker_type",
    )

    # Workaround to return typed parent without importing Project
    def parent_project(self) -> Union["Project", None]:
        if self.parent is None or self.parent.__class__.__name__ != "Project":
            return None
        return self.parent  # type: ignore

    @model_validator(mode="before")
    def upgrade_missing_discriminator_properties(
        cls, data: dict, info: ValidationInfo
    ) -> dict:
        if not info.context or not info.context.get("loading_from_file", False):
            # Not loading from file, so no need to upgrade
            return data

        if not isinstance(data, dict):
            return data

        # backward compatibility:
        # - we originally did not have the chunker_type in the properties, so we need to add it here
        # - we later wanted chunker_type in the properties to use pydantic's discriminated union feature
        properties = data.get("properties", {})
        if "chunker_type" not in properties:
            # the chunker_type on the parent model is always there, we just need to add it to the properties
            properties["chunker_type"] = data["chunker_type"]
            data["properties"] = properties
        return data

    @model_validator(mode="after")
    def ensure_chunker_type_matches_properties(self):
        # sanity check to ensure the chunker_type matches the properties chunker_type
        if self.chunker_type != self.properties["chunker_type"]:
            raise ValueError(
                f"Chunker type mismatch: {self.chunker_type} != {self.properties['chunker_type']}. This is a bug, please report it."
            )
        return self

    # expose the typed properties based on the chunker_type
    @property
    def semantic_properties(self) -> SemanticChunkerProperties:
        if self.properties["chunker_type"] != ChunkerType.SEMANTIC:
            raise ValueError(
                "Semantic properties are only available for semantic chunker."
            )
        # TypedDict cannot be checked at runtime, so we need to ignore the type check
        # or cast (but it is currently banned in our linting rules). Better solution
        # would be discriminated union, but that requires the discriminator to be part
        # of the properties (not outside on the parent model).
        return self.properties

    @property
    def fixed_window_properties(self) -> FixedWindowChunkerProperties:
        if self.properties["chunker_type"] != ChunkerType.FIXED_WINDOW:
            raise ValueError(
                "Fixed window properties are only available for fixed window chunker."
            )
        # TypedDict cannot be checked at runtime, so we need to ignore the type check
        # or cast (but it is currently banned in our linting rules). Better solution
        # would be discriminated union, but that requires the discriminator to be part
        # of the properties (not outside on the parent model).
        return self.properties


class Chunk(BaseModel):
    content: KilnAttachmentModel = Field(
        description="The content of the chunk, stored as an attachment."
    )

    @field_serializer("content")
    def serialize_content(
        self, content: KilnAttachmentModel, info: SerializationInfo
    ) -> dict:
        context = info.context or {}
        context["filename_prefix"] = "content"
        return content.model_dump(mode="json", context=context)


class ChunkedDocument(
    KilnParentedModel, KilnParentModel, parent_of={"chunk_embeddings": ChunkEmbeddings}
):
    chunker_config_id: ID_TYPE = Field(
        description="The ID of the chunker config used to chunk the document.",
    )
    chunks: List[Chunk] = Field(description="The chunks of the document.")

    def parent_extraction(self) -> Union["Extraction", None]:
        if self.parent is None or self.parent.__class__.__name__ != "Extraction":
            return None
        return self.parent  # type: ignore

    def chunk_embeddings(self, readonly: bool = False) -> list[ChunkEmbeddings]:
        return super().chunk_embeddings(readonly=readonly)  # type: ignore

    async def load_chunks_text(self) -> list[str]:
        """Utility to return a list of text for each chunk, loaded from each chunk's content attachment."""
        if not self.path:
            raise ValueError(
                "Failed to resolve the path of chunk content attachment because the chunk does not have a path."
            )

        chunks_text: list[str] = []
        for chunk in self.chunks:
            full_path = chunk.content.resolve_path(self.path.parent)

            try:
                chunks_text.append(
                    await anyio.Path(full_path).read_text(encoding="utf-8")
                )
            except Exception as e:
                raise ValueError(
                    f"Failed to read chunk content for {full_path}: {e}"
                ) from e

        return chunks_text
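
A minimal sketch of constructing a ChunkerConfig in memory, based on the field definitions above. The name and property values are hypothetical, and it assumes the inherited KilnParentedModel fields (id, path, parent, etc.) have defaults or are optional. Note that chunker_type is repeated inside properties because it serves as the discriminator of the properties union, and the fixed-window validator requires chunk_overlap to be strictly less than chunk_size.

    from kiln_ai.datamodel.chunk import ChunkerConfig, ChunkerType

    # Hypothetical fixed-window config; values are illustrative only.
    config = ChunkerConfig(
        name="fixed_window_512",
        chunker_type=ChunkerType.FIXED_WINDOW,
        properties={
            "chunker_type": ChunkerType.FIXED_WINDOW,  # discriminator repeated inside properties
            "chunk_size": 512,
            "chunk_overlap": 64,  # must be strictly less than chunk_size
        },
    )

    # Typed accessor: checks the discriminator, then returns the TypedDict.
    assert config.fixed_window_properties["chunk_size"] == 512

    # The semantic accessor raises for a fixed-window config:
    # config.semantic_properties  # ValueError

    # A semantic config additionally needs embedding_config_id, buffer_size >= 1,
    # breakpoint_percentile_threshold in [0, 100], include_metadata and include_prev_next_rel.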
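
The before-validator upgrade_missing_discriminator_properties only injects the discriminator when the loading_from_file context flag is set, which is presumably what Kiln's file-loading path passes. A hedged sketch of how an older serialized record, whose properties dict predates the discriminator, would be upgraded:

    # Hypothetical legacy on-disk data: chunker_type missing from properties.
    legacy = {
        "name": "fixed_window_512",
        "chunker_type": "fixed_window",
        "properties": {"chunk_size": 512, "chunk_overlap": 64},
    }

    # Without the context flag the validator leaves the data untouched; with it,
    # chunker_type is copied into properties so the discriminated union can resolve.
    config = ChunkerConfig.model_validate(legacy, context={"loading_from_file": True})
    assert config.properties["chunker_type"] == ChunkerType.FIXED_WINDOW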
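
ChunkedDocument.load_chunks_text is async and resolves each chunk's attachment relative to the document's own path, so the document must already be persisted to disk. A small usage sketch; some_chunked_document is a placeholder for a loaded ChunkedDocument instance:

    import anyio

    async def preview_chunks(doc: ChunkedDocument) -> None:
        # Reads every chunk attachment as UTF-8 text; raises ValueError if the
        # document has no path or an attachment cannot be read.
        for index, text in enumerate(await doc.load_chunks_text()):
            print(f"chunk {index}: {text[:80]!r}")

    # anyio.run(preview_chunks, some_chunked_document)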