kiln_ai.adapters.chunkers.fixed_window_chunker
"""Fixed-window chunker built on llama_index's ``SentenceSplitter``."""

from typing import List

from llama_index.core.text_splitter import SentenceSplitter

from kiln_ai.adapters.chunkers.base_chunker import (
    BaseChunker,
    ChunkingResult,
    TextChunk,
)
from kiln_ai.datamodel.chunk import ChunkerConfig, ChunkerType


class FixedWindowChunker(BaseChunker):
    """Chunker that delegates splitting to a ``SentenceSplitter``.

    The splitter is configured from the ``chunk_size`` and ``chunk_overlap``
    entries of the config's ``fixed_window_properties``.
    """

    def __init__(self, chunker_config: ChunkerConfig):
        """Validate the config type and build the underlying splitter.

        Raises:
            ValueError: If ``chunker_config.chunker_type`` is not
                ``ChunkerType.FIXED_WINDOW``.
        """
        # Reject mismatched configs before any base-class initialisation runs.
        if chunker_config.chunker_type != ChunkerType.FIXED_WINDOW:
            raise ValueError("Chunker type must be FIXED_WINDOW")

        super().__init__(chunker_config)

        window_props = chunker_config.fixed_window_properties
        size = window_props["chunk_size"]
        overlap = window_props["chunk_overlap"]
        self.splitter = SentenceSplitter(chunk_size=size, chunk_overlap=overlap)

    async def _chunk(self, text: str) -> ChunkingResult:
        """Split ``text`` with the configured splitter.

        Each string returned by ``split_text`` becomes one ``TextChunk``,
        in the order the splitter produced it.
        """
        pieces = self.splitter.split_text(text)
        wrapped: List[TextChunk] = [TextChunk(text=piece) for piece in pieces]
        return ChunkingResult(chunks=wrapped)
class FixedWindowChunker(BaseChunker):
    """Chunker that delegates splitting to a ``SentenceSplitter``.

    The splitter is configured from the ``chunk_size`` and ``chunk_overlap``
    entries of the config's ``fixed_window_properties``.
    """

    def __init__(self, chunker_config: ChunkerConfig):
        """Validate the config type and build the underlying splitter.

        Raises:
            ValueError: If ``chunker_config.chunker_type`` is not
                ``ChunkerType.FIXED_WINDOW``.
        """
        # Reject mismatched configs before any base-class initialisation runs.
        if chunker_config.chunker_type != ChunkerType.FIXED_WINDOW:
            raise ValueError("Chunker type must be FIXED_WINDOW")

        super().__init__(chunker_config)

        window_props = chunker_config.fixed_window_properties
        size = window_props["chunk_size"]
        overlap = window_props["chunk_overlap"]
        self.splitter = SentenceSplitter(chunk_size=size, chunk_overlap=overlap)

    async def _chunk(self, text: str) -> ChunkingResult:
        """Split ``text`` with the configured splitter.

        Each string returned by ``split_text`` becomes one ``TextChunk``,
        in the order the splitter produced it.
        """
        pieces = self.splitter.split_text(text)
        wrapped: List[TextChunk] = [TextChunk(text=piece) for piece in pieces]
        return ChunkingResult(chunks=wrapped)
Base class for all chunkers.
Should be subclassed by each chunker.
FixedWindowChunker(chunker_config: kiln_ai.datamodel.chunk.ChunkerConfig)
def __init__(self, chunker_config: ChunkerConfig):
    """Validate the config type and build the underlying splitter.

    The splitter's ``chunk_size`` and ``chunk_overlap`` are read from
    ``chunker_config.fixed_window_properties``.

    Raises:
        ValueError: If ``chunker_config.chunker_type`` is not
            ``ChunkerType.FIXED_WINDOW``.
    """
    # Reject mismatched configs before any base-class initialisation runs.
    if chunker_config.chunker_type != ChunkerType.FIXED_WINDOW:
        raise ValueError("Chunker type must be FIXED_WINDOW")

    super().__init__(chunker_config)

    window_props = chunker_config.fixed_window_properties
    size = window_props["chunk_size"]
    overlap = window_props["chunk_overlap"]
    self.splitter = SentenceSplitter(chunk_size=size, chunk_overlap=overlap)