kiln_ai.adapters.chunkers.semantic_chunker

 1from typing import List
 2
 3from llama_index.core.embeddings import BaseEmbedding
 4from llama_index.core.node_parser import SemanticSplitterNodeParser
 5from llama_index.core.schema import Document
 6
 7from kiln_ai.adapters.chunkers.base_chunker import (
 8    BaseChunker,
 9    ChunkingResult,
10    TextChunk,
11)
12from kiln_ai.adapters.chunkers.embedding_wrapper import KilnEmbeddingWrapper
13from kiln_ai.adapters.embedding.embedding_registry import embedding_adapter_from_type
14from kiln_ai.datamodel.chunk import ChunkerConfig, ChunkerType
15from kiln_ai.datamodel.embedding import EmbeddingConfig
16
17
class SemanticChunker(BaseChunker):
    """Chunker that splits text into groups of semantically related sentences.

    Splitting is delegated to llama-index's ``SemanticSplitterNodeParser``,
    configured from the chunker config's ``semantic_properties`` and using an
    embedding model resolved from the ``embedding_config_id`` property.
    """

    def __init__(self, chunker_config: ChunkerConfig):
        """Validate the config, then build the embedding model and splitter.

        Args:
            chunker_config: Chunker configuration; its type must be
                ``ChunkerType.SEMANTIC``.

        Raises:
            ValueError: If the chunker type is not ``SEMANTIC``, or if the
                embedding model cannot be resolved by
                ``_build_embedding_model``.
        """
        if chunker_config.chunker_type != ChunkerType.SEMANTIC:
            raise ValueError("Chunker type must be SEMANTIC")

        super().__init__(chunker_config)

        self.embed_model = self._build_embedding_model(chunker_config)
        self.properties = chunker_config.semantic_properties

        # Read the splitter settings once, in the order they are passed on.
        props = self.properties
        buffer_size = props["buffer_size"]
        percentile_threshold = props["breakpoint_percentile_threshold"]
        with_metadata = props["include_metadata"]
        with_prev_next_rel = props["include_prev_next_rel"]

        self.semantic_splitter = SemanticSplitterNodeParser(
            embed_model=self.embed_model,
            buffer_size=buffer_size,
            breakpoint_percentile_threshold=percentile_threshold,
            include_metadata=with_metadata,
            include_prev_next_rel=with_prev_next_rel,
        )

    def _build_embedding_model(self, chunker_config: ChunkerConfig) -> BaseEmbedding:
        """Look up the configured embedding config and wrap its adapter.

        Args:
            chunker_config: Chunker configuration whose semantic properties
                carry ``embedding_config_id`` and which has a parent project.

        Returns:
            A ``KilnEmbeddingWrapper`` around the adapter built for the
            embedding config.

        Raises:
            ValueError: If ``embedding_config_id`` is ``None``, if the parent
                project or its path is missing, or if no embedding config is
                found for the id.
        """
        embedding_config_id = chunker_config.semantic_properties["embedding_config_id"]
        if embedding_config_id is None:
            raise ValueError("embedding_config_id must be set for semantic chunker")

        project = chunker_config.parent_project()
        project_path = project.path if project is not None else None
        if project_path is None:
            raise ValueError("SemanticChunker requires parent project")

        embedding_config = EmbeddingConfig.from_id_and_parent_path(
            embedding_config_id, project_path
        )
        if embedding_config is None:
            raise ValueError(f"Embedding config not found for id {embedding_config_id}")

        return KilnEmbeddingWrapper(embedding_adapter_from_type(embedding_config))

    async def _chunk(self, text: str) -> ChunkingResult:
        """Split ``text`` with the semantic splitter, one chunk per node.

        Args:
            text: The text to split.

        Returns:
            A ``ChunkingResult`` holding one ``TextChunk`` per node returned
            by the splitter, in the order returned.
        """
        nodes = await self.semantic_splitter.abuild_semantic_nodes_from_documents(
            [Document(text=text)],
        )

        chunks: List[TextChunk] = [
            TextChunk(text=node.get_content()) for node in nodes
        ]
        return ChunkingResult(chunks=chunks)
class SemanticChunker(kiln_ai.adapters.chunkers.base_chunker.BaseChunker):
class SemanticChunker(BaseChunker):
    """Chunker that splits text into groups of semantically related sentences.

    Splitting is delegated to llama-index's ``SemanticSplitterNodeParser``,
    configured from the chunker config's ``semantic_properties`` and using an
    embedding model resolved from the ``embedding_config_id`` property.
    """

    def __init__(self, chunker_config: ChunkerConfig):
        """Validate the config, then build the embedding model and splitter.

        Args:
            chunker_config: Chunker configuration; its type must be
                ``ChunkerType.SEMANTIC``.

        Raises:
            ValueError: If the chunker type is not ``SEMANTIC``, or if the
                embedding model cannot be resolved by
                ``_build_embedding_model``.
        """
        if chunker_config.chunker_type != ChunkerType.SEMANTIC:
            raise ValueError("Chunker type must be SEMANTIC")

        super().__init__(chunker_config)

        self.embed_model = self._build_embedding_model(chunker_config)
        self.properties = chunker_config.semantic_properties

        # Read the splitter settings once, in the order they are passed on.
        props = self.properties
        buffer_size = props["buffer_size"]
        percentile_threshold = props["breakpoint_percentile_threshold"]
        with_metadata = props["include_metadata"]
        with_prev_next_rel = props["include_prev_next_rel"]

        self.semantic_splitter = SemanticSplitterNodeParser(
            embed_model=self.embed_model,
            buffer_size=buffer_size,
            breakpoint_percentile_threshold=percentile_threshold,
            include_metadata=with_metadata,
            include_prev_next_rel=with_prev_next_rel,
        )

    def _build_embedding_model(self, chunker_config: ChunkerConfig) -> BaseEmbedding:
        """Look up the configured embedding config and wrap its adapter.

        Args:
            chunker_config: Chunker configuration whose semantic properties
                carry ``embedding_config_id`` and which has a parent project.

        Returns:
            A ``KilnEmbeddingWrapper`` around the adapter built for the
            embedding config.

        Raises:
            ValueError: If ``embedding_config_id`` is ``None``, if the parent
                project or its path is missing, or if no embedding config is
                found for the id.
        """
        embedding_config_id = chunker_config.semantic_properties["embedding_config_id"]
        if embedding_config_id is None:
            raise ValueError("embedding_config_id must be set for semantic chunker")

        project = chunker_config.parent_project()
        project_path = project.path if project is not None else None
        if project_path is None:
            raise ValueError("SemanticChunker requires parent project")

        embedding_config = EmbeddingConfig.from_id_and_parent_path(
            embedding_config_id, project_path
        )
        if embedding_config is None:
            raise ValueError(f"Embedding config not found for id {embedding_config_id}")

        return KilnEmbeddingWrapper(embedding_adapter_from_type(embedding_config))

    async def _chunk(self, text: str) -> ChunkingResult:
        """Split ``text`` with the semantic splitter, one chunk per node.

        Args:
            text: The text to split.

        Returns:
            A ``ChunkingResult`` holding one ``TextChunk`` per node returned
            by the splitter, in the order returned.
        """
        nodes = await self.semantic_splitter.abuild_semantic_nodes_from_documents(
            [Document(text=text)],
        )

        chunks: List[TextChunk] = [
            TextChunk(text=node.get_content()) for node in nodes
        ]
        return ChunkingResult(chunks=chunks)

Semantic chunker that groups semantically related sentences together.

SemanticChunker(chunker_config: kiln_ai.datamodel.chunk.ChunkerConfig)
22    def __init__(self, chunker_config: ChunkerConfig):
23        if chunker_config.chunker_type != ChunkerType.SEMANTIC:
24            raise ValueError("Chunker type must be SEMANTIC")
25
26        super().__init__(chunker_config)
27
28        self.embed_model = self._build_embedding_model(chunker_config)
29        self.properties = chunker_config.semantic_properties
30
31        self.semantic_splitter = SemanticSplitterNodeParser(
32            embed_model=self.embed_model,
33            buffer_size=self.properties["buffer_size"],
34            breakpoint_percentile_threshold=self.properties[
35                "breakpoint_percentile_threshold"
36            ],
37            include_metadata=self.properties["include_metadata"],
38            include_prev_next_rel=self.properties["include_prev_next_rel"],
39        )
embed_model
properties
semantic_splitter