kiln_ai.adapters.chunkers.semantic_chunker
from typing import List

from llama_index.core.embeddings import BaseEmbedding
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.core.schema import Document

from kiln_ai.adapters.chunkers.base_chunker import (
    BaseChunker,
    ChunkingResult,
    TextChunk,
)
from kiln_ai.adapters.chunkers.embedding_wrapper import KilnEmbeddingWrapper
from kiln_ai.adapters.embedding.embedding_registry import embedding_adapter_from_type
from kiln_ai.datamodel.chunk import ChunkerConfig, ChunkerType
from kiln_ai.datamodel.embedding import EmbeddingConfig


class SemanticChunker(BaseChunker):
    """Semantic chunker that groups semantically related sentences together."""

    def __init__(self, chunker_config: ChunkerConfig):
        """Validate the config, build the embedding model and the splitter.

        Args:
            chunker_config: Chunker configuration. Its ``chunker_type`` must
                be ``ChunkerType.SEMANTIC``; its ``semantic_properties``
                provide the splitter settings.

        Raises:
            ValueError: If the chunker type is not SEMANTIC, or if
                ``_build_embedding_model`` cannot resolve an embedding
                config for this chunker.
        """
        # Reject foreign configs before touching the base class.
        if chunker_config.chunker_type != ChunkerType.SEMANTIC:
            raise ValueError("Chunker type must be SEMANTIC")

        super().__init__(chunker_config)

        # The embedding model is resolved first so that a broken embedding
        # setup fails before any splitter setting is read.
        self.embed_model = self._build_embedding_model(chunker_config)
        self.properties = chunker_config.semantic_properties

        settings = self.properties
        buffer_size = settings["buffer_size"]
        percentile_threshold = settings["breakpoint_percentile_threshold"]
        with_metadata = settings["include_metadata"]
        with_prev_next_rel = settings["include_prev_next_rel"]

        self.semantic_splitter = SemanticSplitterNodeParser(
            embed_model=self.embed_model,
            buffer_size=buffer_size,
            breakpoint_percentile_threshold=percentile_threshold,
            include_metadata=with_metadata,
            include_prev_next_rel=with_prev_next_rel,
        )

    def _build_embedding_model(self, chunker_config: ChunkerConfig) -> BaseEmbedding:
        """Resolve the configured embedding config into an embedding model.

        Args:
            chunker_config: Chunker configuration whose
                ``semantic_properties["embedding_config_id"]`` names the
                embedding config to load from the parent project.

        Returns:
            A ``KilnEmbeddingWrapper`` around the adapter created for the
            loaded embedding config.

        Raises:
            ValueError: If the embedding config id is ``None``, if the
                parent project or its path is missing, or if no embedding
                config is found for the id.
        """
        embedding_config_id = chunker_config.semantic_properties["embedding_config_id"]
        if embedding_config_id is None:
            raise ValueError("embedding_config_id must be set for semantic chunker")

        # A missing project and a project without a path are reported alike.
        project = chunker_config.parent_project()
        project_path = None if project is None else project.path
        if project_path is None:
            raise ValueError("SemanticChunker requires parent project")

        embedding_config = EmbeddingConfig.from_id_and_parent_path(
            embedding_config_id, project_path
        )
        if embedding_config is None:
            raise ValueError(f"Embedding config not found for id {embedding_config_id}")

        return KilnEmbeddingWrapper(embedding_adapter_from_type(embedding_config))

    async def _chunk(self, text: str) -> ChunkingResult:
        """Split ``text`` with the semantic splitter.

        Args:
            text: The text to chunk.

        Returns:
            A ``ChunkingResult`` holding one ``TextChunk`` per node returned
            by the splitter, in the splitter's order.
        """
        nodes = await self.semantic_splitter.abuild_semantic_nodes_from_documents(
            [Document(text=text)],
        )

        chunks: List[TextChunk] = [
            TextChunk(text=node.get_content()) for node in nodes
        ]
        return ChunkingResult(chunks=chunks)
class SemanticChunker(BaseChunker):
    """Semantic chunker that groups semantically related sentences together."""

    def __init__(self, chunker_config: ChunkerConfig):
        """Validate the config, build the embedding model and the splitter.

        Args:
            chunker_config: Chunker configuration. Its ``chunker_type`` must
                be ``ChunkerType.SEMANTIC``; its ``semantic_properties``
                provide the splitter settings.

        Raises:
            ValueError: If the chunker type is not SEMANTIC, or if
                ``_build_embedding_model`` cannot resolve an embedding
                config for this chunker.
        """
        # Reject foreign configs before touching the base class.
        if chunker_config.chunker_type != ChunkerType.SEMANTIC:
            raise ValueError("Chunker type must be SEMANTIC")

        super().__init__(chunker_config)

        # The embedding model is resolved first so that a broken embedding
        # setup fails before any splitter setting is read.
        self.embed_model = self._build_embedding_model(chunker_config)
        self.properties = chunker_config.semantic_properties

        settings = self.properties
        buffer_size = settings["buffer_size"]
        percentile_threshold = settings["breakpoint_percentile_threshold"]
        with_metadata = settings["include_metadata"]
        with_prev_next_rel = settings["include_prev_next_rel"]

        self.semantic_splitter = SemanticSplitterNodeParser(
            embed_model=self.embed_model,
            buffer_size=buffer_size,
            breakpoint_percentile_threshold=percentile_threshold,
            include_metadata=with_metadata,
            include_prev_next_rel=with_prev_next_rel,
        )

    def _build_embedding_model(self, chunker_config: ChunkerConfig) -> BaseEmbedding:
        """Resolve the configured embedding config into an embedding model.

        Args:
            chunker_config: Chunker configuration whose
                ``semantic_properties["embedding_config_id"]`` names the
                embedding config to load from the parent project.

        Returns:
            A ``KilnEmbeddingWrapper`` around the adapter created for the
            loaded embedding config.

        Raises:
            ValueError: If the embedding config id is ``None``, if the
                parent project or its path is missing, or if no embedding
                config is found for the id.
        """
        embedding_config_id = chunker_config.semantic_properties["embedding_config_id"]
        if embedding_config_id is None:
            raise ValueError("embedding_config_id must be set for semantic chunker")

        # A missing project and a project without a path are reported alike.
        project = chunker_config.parent_project()
        project_path = None if project is None else project.path
        if project_path is None:
            raise ValueError("SemanticChunker requires parent project")

        embedding_config = EmbeddingConfig.from_id_and_parent_path(
            embedding_config_id, project_path
        )
        if embedding_config is None:
            raise ValueError(f"Embedding config not found for id {embedding_config_id}")

        return KilnEmbeddingWrapper(embedding_adapter_from_type(embedding_config))

    async def _chunk(self, text: str) -> ChunkingResult:
        """Split ``text`` with the semantic splitter.

        Args:
            text: The text to chunk.

        Returns:
            A ``ChunkingResult`` holding one ``TextChunk`` per node returned
            by the splitter, in the splitter's order.
        """
        nodes = await self.semantic_splitter.abuild_semantic_nodes_from_documents(
            [Document(text=text)],
        )

        chunks: List[TextChunk] = [
            TextChunk(text=node.get_content()) for node in nodes
        ]
        return ChunkingResult(chunks=chunks)
Semantic chunker that groups semantically related sentences together.
SemanticChunker(chunker_config: kiln_ai.datamodel.chunk.ChunkerConfig)
def __init__(self, chunker_config: ChunkerConfig):
    """Validate the config, build the embedding model and the splitter.

    Args:
        chunker_config: Chunker configuration. Its ``chunker_type`` must be
            ``ChunkerType.SEMANTIC``; its ``semantic_properties`` provide
            the splitter settings.

    Raises:
        ValueError: If the chunker type is not SEMANTIC. Errors raised by
            ``_build_embedding_model`` propagate unchanged.
    """
    # Reject foreign configs before touching the base class.
    if chunker_config.chunker_type != ChunkerType.SEMANTIC:
        raise ValueError("Chunker type must be SEMANTIC")

    super().__init__(chunker_config)

    # The embedding model is resolved first so that a broken embedding
    # setup fails before any splitter setting is read.
    self.embed_model = self._build_embedding_model(chunker_config)
    self.properties = chunker_config.semantic_properties

    settings = self.properties
    buffer_size = settings["buffer_size"]
    percentile_threshold = settings["breakpoint_percentile_threshold"]
    with_metadata = settings["include_metadata"]
    with_prev_next_rel = settings["include_prev_next_rel"]

    self.semantic_splitter = SemanticSplitterNodeParser(
        embed_model=self.embed_model,
        buffer_size=buffer_size,
        breakpoint_percentile_threshold=percentile_threshold,
        include_metadata=with_metadata,
        include_prev_next_rel=with_prev_next_rel,
    )