kiln_ai.adapters.eval.base_eval
Module source:

import json
from abc import abstractmethod
from typing import Dict

from kiln_ai.adapters.adapter_registry import adapter_for_task
from kiln_ai.adapters.ml_model_list import ModelProviderName
from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig
from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalScores
from kiln_ai.datamodel.json_schema import validate_schema
from kiln_ai.datamodel.task import RunConfig, TaskOutputRatingType, TaskRun
from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error


class BaseEval:
    """
    Base class for all evals/evaluators.

    Should be subclassed, and the run_eval method implemented.
    """

    def __init__(self, eval_config: EvalConfig, run_config: RunConfig | None):
        self.eval_config = eval_config
        eval = eval_config.parent_eval()
        if not eval:
            raise ValueError("Eval config must have a parent eval")
        self.eval = eval
        task = self.eval.parent_task()
        if not task:
            raise ValueError("Eval must have a parent task")
        self.target_task = task
        self.score_schema = BaseEval.build_score_schema(eval, allow_float_scores=True)
        self.run_config = run_config

    def model_and_provider(self) -> tuple[str, ModelProviderName]:
        model_name = self.eval_config.model_name
        provider = self.eval_config.model_provider
        if (
            not model_name
            or not provider
            or not isinstance(model_name, str)
            or not isinstance(provider, str)
            or provider not in ModelProviderName.__members__
        ):
            raise ValueError(
                "Model name and provider must be set in the eval config model properties"
            )

        return model_name, ModelProviderName(provider)

    async def run_task_and_eval(
        self, input: str
    ) -> tuple[TaskRun, EvalScores, Dict[str, str] | None]:
        """
        Runs the task on the provided run_config to generate fresh output, then runs the eval on that output.
        """
        if self.run_config is None:
            raise ValueError("Run config is required for run_task_and_eval")

        run_adapter = adapter_for_task(
            self.target_task,
            self.run_config.model_name,
            ModelProviderName(self.run_config.model_provider_name),
            base_adapter_config=AdapterConfig(allow_saving=False),
        )

        # Parse structured input if needed
        parsed_input = input
        if self.target_task.output_json_schema is not None:
            parsed_input = json.loads(input)

        # we don't save by default here. We'll save manually after validating the output
        run_output = await run_adapter.invoke(parsed_input)

        eval_output, intermediate_outputs = await self.run_eval(run_output)
        validate_schema(eval_output, self.score_schema)

        return run_output, eval_output, intermediate_outputs

    @abstractmethod
    async def run_eval(
        self, task_run: TaskRun
    ) -> tuple[EvalScores, Dict[str, str] | None]:
        """
        Runs the eval on the given task run.

        Returns a dictionary of scores which should conform to the score schema, and a dictionary of intermediate outputs (eval thinking).
        """
        pass

    @classmethod
    def build_score_schema(cls, eval: Eval, allow_float_scores: bool = False) -> str:
        """
        Build a JSON schema for the scoring output of the task requirements

        We allow 2 modes: allow_float_scores=True and allow_float_scores=False.

        allow_float_scores=False is used for the call to the model, and forces the model into selecting from discrete rating options (int 1-5, pass-fail, etc).
        allow_float_scores=True is used for final score output (for example, after we take a g-eval weighting of the model's logprobs). A pass/fail rating might return 0.75 for likely pass (as opposed to 0.99 for near certain pass), or a 1-5 score might return 3.75.
        """

        # Note: python maintains order, which is good as we want the user defined order, and overall last
        properties = {}
        for output_score in eval.output_scores:
            output_score_json_key = output_score.json_key()

            if len(output_score_json_key) == 0:
                raise ValueError(
                    f"Invalid output score name: {output_score.name}. Can not be used as JSON schema key."
                )
            property: dict[str, str | int | float | list[str] | list[int]] = {
                "title": output_score.name,
            }
            match output_score.type:
                case TaskOutputRatingType.five_star:
                    if allow_float_scores:
                        property["type"] = "number"
                        property["minimum"] = 1
                        property["maximum"] = 5
                    else:
                        property["enum"] = [1, 2, 3, 4, 5]

                    property["description"] = (
                        f"{output_score.instruction}\n\nThe rating should be between 1 and 5, with 1 being the worst and 5 being the best."
                    )
                case TaskOutputRatingType.pass_fail:
                    if allow_float_scores:
                        property["type"] = "number"
                        property["minimum"] = 0
                        property["maximum"] = 1
                        property["description"] = (
                            f"{output_score.instruction}\n\nThe rating should be between 0 and 1, with 0 being a failure and 1 being a pass."
                        )
                    else:
                        property["enum"] = ["pass", "fail"]
                        property["description"] = (
                            f"{output_score.instruction}\n\nThe rating should be either 'pass' or 'fail'."
                        )
                case TaskOutputRatingType.pass_fail_critical:
                    if allow_float_scores:
                        property["type"] = "number"
                        property["minimum"] = -1
                        property["maximum"] = 1
                        property["description"] = (
                            f"{output_score.instruction}\n\nThe rating should be between -1 and 1, with 1 being a pass, 0 being a failure, and -1 being a critical failure (very severe failure)."
                        )
                    else:
                        property["enum"] = ["pass", "fail", "critical"]
                        property["description"] = (
                            f"{output_score.instruction}\n\nThe rating should be either 'pass', 'fail', or 'critical', where critical is a very severe failure."
                        )
                case TaskOutputRatingType.custom:
                    # Skip custom rating types in evals
                    continue
                case _:
                    raise_exhaustive_enum_error(output_score.type)

            properties[output_score_json_key] = property

        schema = {
            "type": "object",
            "properties": properties,
            "required": list(properties.keys()),
        }
        return json.dumps(schema, ensure_ascii=False)
class BaseEval
Base class for all evals/evaluators.
Should be subclassed, and the run_eval method implemented.
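As a brief illustration, here is a minimal sketch of a subclass. It is not part of this module: the class name, the "concise" score key, and the assumption that TaskRun.output.output holds the raw output text are all hypothetical.

from typing import Dict

from kiln_ai.adapters.eval.base_eval import BaseEval
from kiln_ai.datamodel.eval import EvalScores
from kiln_ai.datamodel.task import TaskRun


class LengthCheckEval(BaseEval):
    """Toy evaluator: passes outputs shorter than 1000 characters."""

    async def run_eval(
        self, task_run: TaskRun
    ) -> tuple[EvalScores, Dict[str, str] | None]:
        # Assumes task_run.output.output is the raw output text, and that the
        # parent Eval defines a pass/fail output score with json key "concise".
        output_text = task_run.output.output
        passed = len(output_text) < 1000
        scores: EvalScores = {"concise": 1.0 if passed else 0.0}
        intermediate = {"reasoning": f"Output length was {len(output_text)} characters."}
        return scores, intermediate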
BaseEval(eval_config: EvalConfig, run_config: RunConfig | None)

Builds an evaluator from an eval config. Resolves the parent Eval and its parent Task (raising ValueError if either is missing), builds the score schema with allow_float_scores=True, and stores the optional run config. A run config is only required if you call run_task_and_eval.
def model_and_provider(self) -> tuple[str, ModelProviderName]

Returns the eval config's model name and provider. Raises ValueError if either is missing, not a string, or the provider is not a known ModelProviderName.
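A small sketch of the behavior; the evaluator variable is assumed to be an instance of a BaseEval subclass and is hypothetical.

# model_and_provider returns the judge model configured on the eval config,
# validating it first. A ValueError is raised instead if model_name or
# model_provider are unset, not strings, or the provider is not a member of
# ModelProviderName.
model_name, provider = evaluator.model_and_provider()
print(model_name, provider.value)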
async def run_task_and_eval(self, input: str) -> tuple[TaskRun, EvalScores, Dict[str, str] | None]
Runs the task on the provided run_config to generate fresh output, then runs the eval on that output. Raises ValueError if the evaluator was constructed without a run_config; the eval scores are validated against the score schema before being returned.
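A usage sketch, assuming evaluator is a BaseEval subclass constructed with a non-None RunConfig and eval_input is a raw input string for the target task; both names are placeholders.

from kiln_ai.adapters.eval.base_eval import BaseEval


async def score_one(evaluator: BaseEval, eval_input: str):
    # Runs the target task with the run_config's model, then scores the result.
    task_run, scores, intermediate_outputs = await evaluator.run_task_and_eval(eval_input)
    # `scores` was already validated against evaluator.score_schema.
    print(scores)                # e.g. {"quality": 3.75}; keys depend on the eval's output scores
    print(intermediate_outputs)  # may be None
    return task_run

# e.g. asyncio.run(score_one(my_evaluator, "Summarize: ..."))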
@abstractmethod
async def run_eval(self, task_run: TaskRun) -> tuple[EvalScores, Dict[str, str] | None]
Runs the eval on the given task run.
Returns a dictionary of scores, which should conform to the score schema, and an optional dictionary of intermediate outputs (eval thinking).
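For instance, given an eval whose output scores include a 1-5 "quality" score and a pass/fail "accuracy" score (hypothetical names), a conforming return value could look like this:

from kiln_ai.datamodel.eval import EvalScores

# Keys must match the json_key() of each output score on the parent Eval.
scores: EvalScores = {
    "quality": 4.0,    # five_star: a number in [1, 5] under the float score schema
    "accuracy": 0.75,  # pass_fail: a number in [0, 1] under the float score schema
}
intermediate_outputs = {
    "chain_of_thought": "The answer covers all requested points, but ...",
}
# run_eval returns (scores, intermediate_outputs); intermediate_outputs may be None.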
@classmethod
def build_score_schema(cls, eval: Eval, allow_float_scores: bool = False) -> str
Build a JSON schema for the scoring output of the task requirements
We allow 2 modes: allow_float_scores=True and allow_float_scores=False.
allow_float_scores=False is used for the call to the model, and forces the model into selecting from discrete rating options (int 1-5, pass-fail, etc).

allow_float_scores=True is used for final score output (for example, after we take a g-eval weighting of the model's logprobs). A pass/fail rating might return 0.75 for likely pass (as opposed to 0.99 for near certain pass), or a 1-5 score might return 3.75.
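As a concrete sketch of the output, an eval with a single five_star score named "Quality" (hypothetical; the instruction text is made up, and json_key() is assumed to map "Quality" to "quality") would produce, with allow_float_scores=True, a schema equivalent to the structure below, serialized with json.dumps:

score_schema = {
    "type": "object",
    "properties": {
        "quality": {
            "title": "Quality",
            "type": "number",
            "minimum": 1,
            "maximum": 5,
            "description": (
                "Rate the overall quality of the answer.\n\n"
                "The rating should be between 1 and 5, with 1 being the worst and 5 being the best."
            ),
        },
    },
    "required": ["quality"],
}

With allow_float_scores=False, the same property would instead carry "enum": [1, 2, 3, 4, 5] (and no type/minimum/maximum), which is the discrete form used for the call to the model.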