kiln_ai.adapters.chunkers.fixed_window_chunker

 1from typing import List
 2
 3from llama_index.core.text_splitter import SentenceSplitter
 4
 5from kiln_ai.adapters.chunkers.base_chunker import (
 6    BaseChunker,
 7    ChunkingResult,
 8    TextChunk,
 9)
10from kiln_ai.datamodel.chunk import ChunkerConfig, ChunkerType
11
12
class FixedWindowChunker(BaseChunker):
    """Chunker that delegates text splitting to a llama_index ``SentenceSplitter``.

    The splitter is created once, at construction time, from the chunk size
    and chunk overlap reported by the supplied ``ChunkerConfig``.
    """

    def __init__(self, chunker_config: ChunkerConfig):
        """Validate ``chunker_config`` and build the underlying splitter.

        Args:
            chunker_config: Configuration whose ``chunker_type`` must be
                ``ChunkerType.FIXED_WINDOW`` and whose ``chunk_size()`` and
                ``chunk_overlap()`` must both return a non-``None`` value.

        Raises:
            ValueError: If the chunker type is not ``FIXED_WINDOW``, or if
                the chunk size or the chunk overlap is ``None``.
        """
        if chunker_config.chunker_type != ChunkerType.FIXED_WINDOW:
            raise ValueError("Chunker type must be FIXED_WINDOW")

        size = chunker_config.chunk_size()
        if size is None:
            raise ValueError("Chunk size must be set")

        overlap = chunker_config.chunk_overlap()
        if overlap is None:
            raise ValueError("Chunk overlap must be set")

        # Every check above runs before the base initializer is invoked.
        super().__init__(chunker_config)
        self.splitter = SentenceSplitter(chunk_size=size, chunk_overlap=overlap)

    async def _chunk(self, text: str) -> ChunkingResult:
        """Split ``text`` with the configured splitter.

        Each string returned by ``self.splitter.split_text`` is wrapped in a
        ``TextChunk``, preserving the order in which the splitter produced it.
        """
        pieces = self.splitter.split_text(text)
        wrapped: List[TextChunk] = [TextChunk(text=piece) for piece in pieces]
        return ChunkingResult(chunks=wrapped)
class FixedWindowChunker(kiln_ai.adapters.chunkers.base_chunker.BaseChunker):
class FixedWindowChunker(BaseChunker):
    """Chunker that delegates text splitting to a llama_index ``SentenceSplitter``.

    The splitter is created once, at construction time, from the chunk size
    and chunk overlap reported by the supplied ``ChunkerConfig``.
    """

    def __init__(self, chunker_config: ChunkerConfig):
        """Validate ``chunker_config`` and build the underlying splitter.

        Args:
            chunker_config: Configuration whose ``chunker_type`` must be
                ``ChunkerType.FIXED_WINDOW`` and whose ``chunk_size()`` and
                ``chunk_overlap()`` must both return a non-``None`` value.

        Raises:
            ValueError: If the chunker type is not ``FIXED_WINDOW``, or if
                the chunk size or the chunk overlap is ``None``.
        """
        if chunker_config.chunker_type != ChunkerType.FIXED_WINDOW:
            raise ValueError("Chunker type must be FIXED_WINDOW")

        size = chunker_config.chunk_size()
        if size is None:
            raise ValueError("Chunk size must be set")

        overlap = chunker_config.chunk_overlap()
        if overlap is None:
            raise ValueError("Chunk overlap must be set")

        # Every check above runs before the base initializer is invoked.
        super().__init__(chunker_config)
        self.splitter = SentenceSplitter(chunk_size=size, chunk_overlap=overlap)

    async def _chunk(self, text: str) -> ChunkingResult:
        """Split ``text`` with the configured splitter.

        Each string returned by ``self.splitter.split_text`` is wrapped in a
        ``TextChunk``, preserving the order in which the splitter produced it.
        """
        pieces = self.splitter.split_text(text)
        wrapped: List[TextChunk] = [TextChunk(text=piece) for piece in pieces]
        return ChunkingResult(chunks=wrapped)

Base class for all chunkers.

Should be subclassed by each chunker.

FixedWindowChunker(chunker_config: kiln_ai.datamodel.chunk.ChunkerConfig)
15    def __init__(self, chunker_config: ChunkerConfig):
16        if chunker_config.chunker_type != ChunkerType.FIXED_WINDOW:
17            raise ValueError("Chunker type must be FIXED_WINDOW")
18
19        chunk_size = chunker_config.chunk_size()
20        if chunk_size is None:
21            raise ValueError("Chunk size must be set")
22
23        chunk_overlap = chunker_config.chunk_overlap()
24        if chunk_overlap is None:
25            raise ValueError("Chunk overlap must be set")
26
27        super().__init__(chunker_config)
28        self.splitter = SentenceSplitter(
29            chunk_size=chunk_size,
30            chunk_overlap=chunk_overlap,
31        )
splitter