kiln_ai.datamodel.chunk

  1import logging
  2from enum import Enum
  3from typing import TYPE_CHECKING, List, Union
  4
  5import anyio
  6from pydantic import (
  7    BaseModel,
  8    Field,
  9    SerializationInfo,
 10    ValidationInfo,
 11    field_serializer,
 12    field_validator,
 13)
 14
 15from kiln_ai.datamodel.basemodel import (
 16    ID_TYPE,
 17    FilenameString,
 18    KilnAttachmentModel,
 19    KilnParentedModel,
 20    KilnParentModel,
 21)
 22from kiln_ai.datamodel.embedding import ChunkEmbeddings
 23
 24logger = logging.getLogger(__name__)
 25
 26if TYPE_CHECKING:
 27    from kiln_ai.datamodel.extraction import Extraction
 28    from kiln_ai.datamodel.project import Project
 29
 30
 31def validate_fixed_window_chunker_properties(
 32    properties: dict[str, str | int | float | bool],
 33) -> dict[str, str | int | float | bool]:
 34    """Validate the properties for the fixed window chunker and set defaults if needed."""
 35    chunk_overlap = properties.get("chunk_overlap")
 36    if chunk_overlap is None:
 37        raise ValueError("Chunk overlap is required.")
 38
 39    chunk_size = properties.get("chunk_size")
 40    if chunk_size is None:
 41        raise ValueError("Chunk size is required.")
 42
 43    if not isinstance(chunk_overlap, int):
 44        raise ValueError("Chunk overlap must be an integer.")
 45    if chunk_overlap < 0:
 46        raise ValueError("Chunk overlap must be greater than or equal to 0.")
 47
 48    if not isinstance(chunk_size, int):
 49        raise ValueError("Chunk size must be an integer.")
 50    if chunk_size <= 0:
 51        raise ValueError("Chunk size must be greater than 0.")
 52
 53    if chunk_overlap >= chunk_size:
 54        raise ValueError("Chunk overlap must be less than chunk size.")
 55
 56    return properties
 57
 58
 59class ChunkerType(str, Enum):
 60    FIXED_WINDOW = "fixed_window"
 61
 62
 63class ChunkerConfig(KilnParentedModel):
 64    name: FilenameString = Field(
 65        description="A name to identify the chunker config.",
 66    )
 67    description: str | None = Field(
 68        default=None, description="The description of the chunker config"
 69    )
 70    chunker_type: ChunkerType = Field(
 71        description="This is used to determine the type of chunker to use.",
 72    )
 73    properties: dict[str, str | int | float | bool] = Field(
 74        description="Properties to be used to execute the chunker config. This is chunker_type specific and should serialize to a json dict.",
 75    )
 76
 77    # Workaround to return typed parent without importing Project
 78    def parent_project(self) -> Union["Project", None]:
 79        if self.parent is None or self.parent.__class__.__name__ != "Project":
 80            return None
 81        return self.parent  # type: ignore
 82
 83    @field_validator("properties")
 84    @classmethod
 85    def validate_properties(
 86        cls, properties: dict[str, str | int | float | bool], info: ValidationInfo
 87    ) -> dict[str, str | int | float | bool]:
 88        if info.data.get("chunker_type") == ChunkerType.FIXED_WINDOW:
 89            # do not trigger revalidation of properties
 90            return validate_fixed_window_chunker_properties(properties)
 91        return properties
 92
 93    def chunk_size(self) -> int | None:
 94        if self.properties.get("chunk_size") is None:
 95            return None
 96        if not isinstance(self.properties["chunk_size"], int):
 97            raise ValueError("Chunk size must be an integer.")
 98        return self.properties["chunk_size"]
 99
100    def chunk_overlap(self) -> int | None:
101        if self.properties.get("chunk_overlap") is None:
102            return None
103        if not isinstance(self.properties["chunk_overlap"], int):
104            raise ValueError("Chunk overlap must be an integer.")
105        return self.properties["chunk_overlap"]
106
107
class Chunk(BaseModel):
    """A single chunk of a document, with its content held as an attachment."""

    content: KilnAttachmentModel = Field(
        description="The content of the chunk, stored as an attachment."
    )

    @field_serializer("content")
    def serialize_content(
        self, content: KilnAttachmentModel, info: SerializationInfo
    ) -> dict:
        """Serialize ``content`` with ``filename_prefix`` set to ``"content"``.

        Args:
            content: The attachment to serialize.
            info: Serialization info; its ``context`` (if any) is forwarded
                to the attachment with ``filename_prefix`` added.

        Returns:
            The result of dumping the attachment in JSON mode.
        """
        # Build a fresh dict rather than writing into info.context: that
        # object belongs to the caller and is shared by every serializer in
        # the same dump, so mutating it would leak the prefix elsewhere.
        context = {**(info.context or {}), "filename_prefix": "content"}
        return content.model_dump(mode="json", context=context)
120
121
class ChunkedDocument(
    KilnParentedModel, KilnParentModel, parent_of={"chunk_embeddings": ChunkEmbeddings}
):
    """The chunks produced for a document by a given chunker config.

    Registered as the parent of ``ChunkEmbeddings`` under the
    ``chunk_embeddings`` relationship.
    """

    chunker_config_id: ID_TYPE = Field(
        description="The ID of the chunker config used to chunk the document.",
    )
    chunks: List[Chunk] = Field(description="The chunks of the document.")

    def parent_extraction(self) -> Union["Extraction", None]:
        """Return the parent when its class is named ``Extraction``, else ``None``."""
        owner = self.parent
        if owner is not None and owner.__class__.__name__ == "Extraction":
            return owner  # type: ignore
        return None

    def chunk_embeddings(self, readonly: bool = False) -> list[ChunkEmbeddings]:
        """Typed pass-through to the ``chunk_embeddings`` accessor of the base class."""
        return super().chunk_embeddings(readonly=readonly)  # type: ignore

    async def load_chunks_text(self) -> list[str]:
        """Read every chunk's content attachment and return the texts in chunk order.

        Raises:
            ValueError: If this model has no path, or if a chunk's content
                file cannot be read as UTF-8 text.
        """
        if not self.path:
            raise ValueError(
                "Failed to resolve the path of chunk content attachment because the chunk does not have a path."
            )

        # Attachment paths are resolved against the directory containing this
        # model's file.
        base_dir = self.path.parent

        texts: list[str] = []
        for chunk in self.chunks:
            full_path = chunk.content.resolve_path(base_dir)
            try:
                text = await anyio.Path(full_path).read_text(encoding="utf-8")
            except Exception as e:
                # Re-raise with the offending path so the failing chunk is identifiable.
                raise ValueError(
                    f"Failed to read chunk content for {full_path}: {e}"
                ) from e
            texts.append(text)

        return texts
logger = <Logger kiln_ai.datamodel.chunk (WARNING)>
def validate_fixed_window_chunker_properties( properties: dict[str, str | int | float | bool]) -> dict[str, str | int | float | bool]:
32def validate_fixed_window_chunker_properties(
33    properties: dict[str, str | int | float | bool],
34) -> dict[str, str | int | float | bool]:
35    """Validate the properties for the fixed window chunker and set defaults if needed."""
36    chunk_overlap = properties.get("chunk_overlap")
37    if chunk_overlap is None:
38        raise ValueError("Chunk overlap is required.")
39
40    chunk_size = properties.get("chunk_size")
41    if chunk_size is None:
42        raise ValueError("Chunk size is required.")
43
44    if not isinstance(chunk_overlap, int):
45        raise ValueError("Chunk overlap must be an integer.")
46    if chunk_overlap < 0:
47        raise ValueError("Chunk overlap must be greater than or equal to 0.")
48
49    if not isinstance(chunk_size, int):
50        raise ValueError("Chunk size must be an integer.")
51    if chunk_size <= 0:
52        raise ValueError("Chunk size must be greater than 0.")
53
54    if chunk_overlap >= chunk_size:
55        raise ValueError("Chunk overlap must be less than chunk size.")
56
57    return properties

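Validate the properties for the fixed window chunker. Both chunk_size and chunk_overlap are required integers (chunk_size > 0 and 0 <= chunk_overlap < chunk_size); no defaults are applied and the properties are returned unchanged.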

class ChunkerType(builtins.str, enum.Enum):
60class ChunkerType(str, Enum):
61    FIXED_WINDOW = "fixed_window"

str(object='') -> str str(bytes_or_buffer[, encoding[, errors]]) -> str

Create a new string object from the given object. If encoding or errors is specified, then the object must expose a data buffer that will be decoded using the given encoding and error handler. Otherwise, returns the result of object.__str__() (if defined) or repr(object). encoding defaults to 'utf-8'. errors defaults to 'strict'.

FIXED_WINDOW = <ChunkerType.FIXED_WINDOW: 'fixed_window'>
class ChunkerConfig(kiln_ai.datamodel.basemodel.KilnParentedModel):
 64class ChunkerConfig(KilnParentedModel):
 65    name: FilenameString = Field(
 66        description="A name to identify the chunker config.",
 67    )
 68    description: str | None = Field(
 69        default=None, description="The description of the chunker config"
 70    )
 71    chunker_type: ChunkerType = Field(
 72        description="This is used to determine the type of chunker to use.",
 73    )
 74    properties: dict[str, str | int | float | bool] = Field(
 75        description="Properties to be used to execute the chunker config. This is chunker_type specific and should serialize to a json dict.",
 76    )
 77
 78    # Workaround to return typed parent without importing Project
 79    def parent_project(self) -> Union["Project", None]:
 80        if self.parent is None or self.parent.__class__.__name__ != "Project":
 81            return None
 82        return self.parent  # type: ignore
 83
 84    @field_validator("properties")
 85    @classmethod
 86    def validate_properties(
 87        cls, properties: dict[str, str | int | float | bool], info: ValidationInfo
 88    ) -> dict[str, str | int | float | bool]:
 89        if info.data.get("chunker_type") == ChunkerType.FIXED_WINDOW:
 90            # do not trigger revalidation of properties
 91            return validate_fixed_window_chunker_properties(properties)
 92        return properties
 93
 94    def chunk_size(self) -> int | None:
 95        if self.properties.get("chunk_size") is None:
 96            return None
 97        if not isinstance(self.properties["chunk_size"], int):
 98            raise ValueError("Chunk size must be an integer.")
 99        return self.properties["chunk_size"]
100
101    def chunk_overlap(self) -> int | None:
102        if self.properties.get("chunk_overlap") is None:
103            return None
104        if not isinstance(self.properties["chunk_overlap"], int):
105            raise ValueError("Chunk overlap must be an integer.")
106        return self.properties["chunk_overlap"]

Base model for Kiln models that have a parent-child relationship. This base class is for child models.

This class provides functionality for managing hierarchical relationships between models, including parent reference handling and file system organization.

Attributes: parent (KilnBaseModel): Reference to the parent model instance. Not persisted, just in memory.

name: Annotated[str, BeforeValidator(func=<function name_validator.<locals>.fn at 0x7fc0765f0900>, json_schema_input_type=PydanticUndefined)]
description: str | None
chunker_type: ChunkerType
properties: dict[str, str | int | float | bool]
def parent_project(self) -> Optional[kiln_ai.datamodel.Project]:
79    def parent_project(self) -> Union["Project", None]:
80        if self.parent is None or self.parent.__class__.__name__ != "Project":
81            return None
82        return self.parent  # type: ignore
@field_validator('properties')
@classmethod
def validate_properties( cls, properties: dict[str, str | int | float | bool], info: pydantic_core.core_schema.ValidationInfo) -> dict[str, str | int | float | bool]:
84    @field_validator("properties")
85    @classmethod
86    def validate_properties(
87        cls, properties: dict[str, str | int | float | bool], info: ValidationInfo
88    ) -> dict[str, str | int | float | bool]:
89        if info.data.get("chunker_type") == ChunkerType.FIXED_WINDOW:
90            # do not trigger revalidation of properties
91            return validate_fixed_window_chunker_properties(properties)
92        return properties
def chunk_size(self) -> int | None:
94    def chunk_size(self) -> int | None:
95        if self.properties.get("chunk_size") is None:
96            return None
97        if not isinstance(self.properties["chunk_size"], int):
98            raise ValueError("Chunk size must be an integer.")
99        return self.properties["chunk_size"]
def chunk_overlap(self) -> int | None:
101    def chunk_overlap(self) -> int | None:
102        if self.properties.get("chunk_overlap") is None:
103            return None
104        if not isinstance(self.properties["chunk_overlap"], int):
105            raise ValueError("Chunk overlap must be an integer.")
106        return self.properties["chunk_overlap"]
def relationship_name() -> str:
661        def relationship_name_method() -> str:
662            return relationship_name

The type of the None singleton.

def parent_type() -> Type[kiln_ai.datamodel.basemodel.KilnParentModel]:
654        def parent_class_method() -> Type[KilnParentModel]:
655            return cls

The type of the None singleton.

model_config = {'validate_assignment': True}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

def model_post_init(self: pydantic.main.BaseModel, context: Any, /) -> None:
337def init_private_attributes(self: BaseModel, context: Any, /) -> None:
338    """This function is meant to behave like a BaseModel method to initialise private attributes.
339
340    It takes context as an argument since that's what pydantic-core passes when calling it.
341
342    Args:
343        self: The BaseModel instance.
344        context: The context.
345    """
346    if getattr(self, '__pydantic_private__', None) is None:
347        pydantic_private = {}
348        for name, private_attr in self.__private_attributes__.items():
349            default = private_attr.get_default()
350            if default is not PydanticUndefined:
351                pydantic_private[name] = default
352        object_setattr(self, '__pydantic_private__', pydantic_private)

This function is meant to behave like a BaseModel method to initialise private attributes.

It takes context as an argument since that's what pydantic-core passes when calling it.

Args: self: The BaseModel instance. context: The context.

class Chunk(pydantic.main.BaseModel):
109class Chunk(BaseModel):
110    content: KilnAttachmentModel = Field(
111        description="The content of the chunk, stored as an attachment."
112    )
113
114    @field_serializer("content")
115    def serialize_content(
116        self, content: KilnAttachmentModel, info: SerializationInfo
117    ) -> dict:
118        context = info.context or {}
119        context["filename_prefix"] = "content"
120        return content.model_dump(mode="json", context=context)

!!! abstract "Usage Documentation" Models

A base class for creating Pydantic models.

Attributes: __class_vars__: The names of the class variables defined on the model. __private_attributes__: Metadata about the private attributes of the model. __signature__: The synthesized __init__ [Signature][inspect.Signature] of the model.

__pydantic_complete__: Whether model building is completed, or if there are still undefined fields.
__pydantic_core_schema__: The core schema of the model.
__pydantic_custom_init__: Whether the model has a custom `__init__` function.
__pydantic_decorators__: Metadata containing the decorators defined on the model.
    This replaces `Model.__validators__` and `Model.__root_validators__` from Pydantic V1.
__pydantic_generic_metadata__: Metadata for generic models; contains data used for a similar purpose to
    __args__, __origin__, __parameters__ in typing-module generics. May eventually be replaced by these.
__pydantic_parent_namespace__: Parent namespace of the model, used for automatic rebuilding of models.
__pydantic_post_init__: The name of the post-init method for the model, if defined.
__pydantic_root_model__: Whether the model is a [`RootModel`][pydantic.root_model.RootModel].
__pydantic_serializer__: The `pydantic-core` `SchemaSerializer` used to dump instances of the model.
__pydantic_validator__: The `pydantic-core` `SchemaValidator` used to validate instances of the model.

__pydantic_fields__: A dictionary of field names and their corresponding [`FieldInfo`][pydantic.fields.FieldInfo] objects.
__pydantic_computed_fields__: A dictionary of computed field names and their corresponding [`ComputedFieldInfo`][pydantic.fields.ComputedFieldInfo] objects.

__pydantic_extra__: A dictionary containing extra values, if [`extra`][pydantic.config.ConfigDict.extra]
    is set to `'allow'`.
__pydantic_fields_set__: The names of fields explicitly set during instantiation.
__pydantic_private__: Values of private attributes set on the model instance.
content: kiln_ai.datamodel.basemodel.KilnAttachmentModel
@field_serializer('content')
def serialize_content( self, content: kiln_ai.datamodel.basemodel.KilnAttachmentModel, info: pydantic_core.core_schema.SerializationInfo) -> dict:
114    @field_serializer("content")
115    def serialize_content(
116        self, content: KilnAttachmentModel, info: SerializationInfo
117    ) -> dict:
118        context = info.context or {}
119        context["filename_prefix"] = "content"
120        return content.model_dump(mode="json", context=context)
model_config: ClassVar[pydantic.config.ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

class ChunkedDocument(kiln_ai.datamodel.basemodel.KilnParentedModel, kiln_ai.datamodel.basemodel.KilnParentModel):
123class ChunkedDocument(
124    KilnParentedModel, KilnParentModel, parent_of={"chunk_embeddings": ChunkEmbeddings}
125):
126    chunker_config_id: ID_TYPE = Field(
127        description="The ID of the chunker config used to chunk the document.",
128    )
129    chunks: List[Chunk] = Field(description="The chunks of the document.")
130
131    def parent_extraction(self) -> Union["Extraction", None]:
132        if self.parent is None or self.parent.__class__.__name__ != "Extraction":
133            return None
134        return self.parent  # type: ignore
135
136    def chunk_embeddings(self, readonly: bool = False) -> list[ChunkEmbeddings]:
137        return super().chunk_embeddings(readonly=readonly)  # type: ignore
138
139    async def load_chunks_text(self) -> list[str]:
140        """Utility to return a list of text for each chunk, loaded from each chunk's content attachment."""
141        if not self.path:
142            raise ValueError(
143                "Failed to resolve the path of chunk content attachment because the chunk does not have a path."
144            )
145
146        chunks_text: list[str] = []
147        for chunk in self.chunks:
148            full_path = chunk.content.resolve_path(self.path.parent)
149
150            try:
151                chunks_text.append(
152                    await anyio.Path(full_path).read_text(encoding="utf-8")
153                )
154            except Exception as e:
155                raise ValueError(
156                    f"Failed to read chunk content for {full_path}: {e}"
157                ) from e
158
159        return chunks_text

Base model for Kiln models that have a parent-child relationship. This base class is for child models.

This class provides functionality for managing hierarchical relationships between models, including parent reference handling and file system organization.

Attributes: parent (KilnBaseModel): Reference to the parent model instance. Not persisted, just in memory.

chunker_config_id: Optional[str]
chunks: List[Chunk]
def parent_extraction(self) -> Optional[kiln_ai.datamodel.extraction.Extraction]:
131    def parent_extraction(self) -> Union["Extraction", None]:
132        if self.parent is None or self.parent.__class__.__name__ != "Extraction":
133            return None
134        return self.parent  # type: ignore
def chunk_embeddings( self, readonly=False) -> List[kiln_ai.datamodel.embedding.ChunkEmbeddings]:
643        def child_method(self, readonly: bool = False) -> list[child_class]:
644            return child_class.all_children_of_parent_path(self.path, readonly=readonly)

The type of the None singleton.

async def load_chunks_text(self) -> list[str]:
139    async def load_chunks_text(self) -> list[str]:
140        """Utility to return a list of text for each chunk, loaded from each chunk's content attachment."""
141        if not self.path:
142            raise ValueError(
143                "Failed to resolve the path of chunk content attachment because the chunk does not have a path."
144            )
145
146        chunks_text: list[str] = []
147        for chunk in self.chunks:
148            full_path = chunk.content.resolve_path(self.path.parent)
149
150            try:
151                chunks_text.append(
152                    await anyio.Path(full_path).read_text(encoding="utf-8")
153                )
154            except Exception as e:
155                raise ValueError(
156                    f"Failed to read chunk content for {full_path}: {e}"
157                ) from e
158
159        return chunks_text

Utility to return a list of text for each chunk, loaded from each chunk's content attachment.

def relationship_name() -> str:
661        def relationship_name_method() -> str:
662            return relationship_name

The type of the None singleton.

def parent_type() -> Type[kiln_ai.datamodel.basemodel.KilnParentModel]:
654        def parent_class_method() -> Type[KilnParentModel]:
655            return cls

The type of the None singleton.

model_config = {'validate_assignment': True}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

def model_post_init(self: pydantic.main.BaseModel, context: Any, /) -> None:
337def init_private_attributes(self: BaseModel, context: Any, /) -> None:
338    """This function is meant to behave like a BaseModel method to initialise private attributes.
339
340    It takes context as an argument since that's what pydantic-core passes when calling it.
341
342    Args:
343        self: The BaseModel instance.
344        context: The context.
345    """
346    if getattr(self, '__pydantic_private__', None) is None:
347        pydantic_private = {}
348        for name, private_attr in self.__private_attributes__.items():
349            default = private_attr.get_default()
350            if default is not PydanticUndefined:
351                pydantic_private[name] = default
352        object_setattr(self, '__pydantic_private__', pydantic_private)

This function is meant to behave like a BaseModel method to initialise private attributes.

It takes context as an argument since that's what pydantic-core passes when calling it.

Args: self: The BaseModel instance. context: The context.