kiln_ai.adapters.chunkers.base_chunker
import logging
from abc import ABC, abstractmethod

from pydantic import BaseModel, Field

from kiln_ai.adapters.chunkers.helpers import clean_up_text
from kiln_ai.datamodel.chunk import ChunkerConfig

logger = logging.getLogger(__name__)


class TextChunk(BaseModel):
    # One chunk of text, as carried inside a ChunkingResult.
    text: str = Field(
        description="The text of the chunk.",
    )


class ChunkingResult(BaseModel):
    # What a chunker returns: the list of TextChunk items it produced.
    chunks: list[TextChunk] = Field(
        description="The chunks of the text.",
    )


class BaseChunker(ABC):
    """
    Abstract parent of every chunker.

    Subclasses supply the splitting logic in `_chunk`; callers use `chunk`,
    which cleans the input before handing it to `_chunk`.
    """

    def __init__(self, chunker_config: ChunkerConfig):
        """Keep the given configuration on the instance as `chunker_config`."""
        self.chunker_config = chunker_config

    async def chunk(self, text: str) -> ChunkingResult:
        """
        Clean `text` and split it into chunks.

        Returns a result with no chunks when `text` is empty or when nothing
        is left after `clean_up_text`; otherwise returns whatever `_chunk`
        produces for the cleaned text.
        """
        # Only delegate when there is still content after clean-up.
        if text and (cleaned := clean_up_text(text)):
            return await self._chunk(cleaned)
        return ChunkingResult(chunks=[])

    @abstractmethod
    async def _chunk(self, text: str) -> ChunkingResult:
        """Split already-cleaned, non-empty `text`; implemented by subclasses."""
logger =
<Logger kiln_ai.adapters.chunkers.base_chunker (WARNING)>
class
TextChunk(pydantic.main.BaseModel):
!!! abstract "Usage Documentation" Models
A base class for creating Pydantic models.
Attributes:
__class_vars__: The names of the class variables defined on the model.
__private_attributes__: Metadata about the private attributes of the model.
__signature__: The synthesized `__init__` [`Signature`][inspect.Signature] of the model.
__pydantic_complete__: Whether model building is completed, or if there are still undefined fields.
__pydantic_core_schema__: The core schema of the model.
__pydantic_custom_init__: Whether the model has a custom `__init__` function.
__pydantic_decorators__: Metadata containing the decorators defined on the model.
This replaces `Model.__validators__` and `Model.__root_validators__` from Pydantic V1.
__pydantic_generic_metadata__: Metadata for generic models; contains data used for a similar purpose to
__args__, __origin__, __parameters__ in typing-module generics. May eventually be replaced by these.
__pydantic_parent_namespace__: Parent namespace of the model, used for automatic rebuilding of models.
__pydantic_post_init__: The name of the post-init method for the model, if defined.
__pydantic_root_model__: Whether the model is a [`RootModel`][pydantic.root_model.RootModel].
__pydantic_serializer__: The `pydantic-core` `SchemaSerializer` used to dump instances of the model.
__pydantic_validator__: The `pydantic-core` `SchemaValidator` used to validate instances of the model.
__pydantic_fields__: A dictionary of field names and their corresponding [`FieldInfo`][pydantic.fields.FieldInfo] objects.
__pydantic_computed_fields__: A dictionary of computed field names and their corresponding [`ComputedFieldInfo`][pydantic.fields.ComputedFieldInfo] objects.
__pydantic_extra__: A dictionary containing extra values, if [`extra`][pydantic.config.ConfigDict.extra]
is set to `'allow'`.
__pydantic_fields_set__: The names of fields explicitly set during instantiation.
__pydantic_private__: Values of private attributes set on the model instance.
class
ChunkingResult(pydantic.main.BaseModel):
class ChunkingResult(BaseModel):
    # What a chunker returns: the list of TextChunk items it produced.
    chunks: list[TextChunk] = Field(
        description="The chunks of the text.",
    )
!!! abstract "Usage Documentation" Models
A base class for creating Pydantic models.
Attributes:
__class_vars__: The names of the class variables defined on the model.
__private_attributes__: Metadata about the private attributes of the model.
__signature__: The synthesized `__init__` [`Signature`][inspect.Signature] of the model.
__pydantic_complete__: Whether model building is completed, or if there are still undefined fields.
__pydantic_core_schema__: The core schema of the model.
__pydantic_custom_init__: Whether the model has a custom `__init__` function.
__pydantic_decorators__: Metadata containing the decorators defined on the model.
This replaces `Model.__validators__` and `Model.__root_validators__` from Pydantic V1.
__pydantic_generic_metadata__: Metadata for generic models; contains data used for a similar purpose to
__args__, __origin__, __parameters__ in typing-module generics. May eventually be replaced by these.
__pydantic_parent_namespace__: Parent namespace of the model, used for automatic rebuilding of models.
__pydantic_post_init__: The name of the post-init method for the model, if defined.
__pydantic_root_model__: Whether the model is a [`RootModel`][pydantic.root_model.RootModel].
__pydantic_serializer__: The `pydantic-core` `SchemaSerializer` used to dump instances of the model.
__pydantic_validator__: The `pydantic-core` `SchemaValidator` used to validate instances of the model.
__pydantic_fields__: A dictionary of field names and their corresponding [`FieldInfo`][pydantic.fields.FieldInfo] objects.
__pydantic_computed_fields__: A dictionary of computed field names and their corresponding [`ComputedFieldInfo`][pydantic.fields.ComputedFieldInfo] objects.
__pydantic_extra__: A dictionary containing extra values, if [`extra`][pydantic.config.ConfigDict.extra]
is set to `'allow'`.
__pydantic_fields_set__: The names of fields explicitly set during instantiation.
__pydantic_private__: Values of private attributes set on the model instance.
chunks: list[TextChunk]
class
BaseChunker(abc.ABC):
class BaseChunker(ABC):
    """
    Abstract parent of every chunker.

    Subclasses supply the splitting logic in `_chunk`; callers use `chunk`,
    which cleans the input before handing it to `_chunk`.
    """

    def __init__(self, chunker_config: ChunkerConfig):
        """Keep the given configuration on the instance as `chunker_config`."""
        self.chunker_config = chunker_config

    async def chunk(self, text: str) -> ChunkingResult:
        """
        Clean `text` and split it into chunks.

        Returns a result with no chunks when `text` is empty or when nothing
        is left after `clean_up_text`; otherwise returns whatever `_chunk`
        produces for the cleaned text.
        """
        # Only delegate when there is still content after clean-up.
        if text and (cleaned := clean_up_text(text)):
            return await self._chunk(cleaned)
        return ChunkingResult(chunks=[])

    @abstractmethod
    async def _chunk(self, text: str) -> ChunkingResult:
        """Split already-cleaned, non-empty `text`; implemented by subclasses."""
Base class for all chunkers.
Should be subclassed by each chunker.