kiln_ai.adapters.eval.base_eval
Module source:

import json
from abc import abstractmethod
from typing import Dict

from kiln_ai.adapters.adapter_registry import adapter_for_task
from kiln_ai.adapters.ml_model_list import ModelProviderName
from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig
from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalScores
from kiln_ai.datamodel.json_schema import validate_schema_with_value_error
from kiln_ai.datamodel.task import (
    RunConfig,
    RunConfigProperties,
    TaskOutputRatingType,
    TaskRun,
)
from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error


class BaseEval:
    """
    Base class for all evals/evaluators.

    Should be subclassed, and the run_eval method implemented.
    """

    def __init__(self, eval_config: EvalConfig, run_config: RunConfig | None):
        self.eval_config = eval_config
        eval = eval_config.parent_eval()
        if not eval:
            raise ValueError("Eval config must have a parent eval")
        self.eval = eval
        task = self.eval.parent_task()
        if not task:
            raise ValueError("Eval must have a parent task")
        self.target_task = task
        self.score_schema = BaseEval.build_score_schema(eval, allow_float_scores=True)
        self.run_config = run_config

    def model_and_provider(self) -> tuple[str, ModelProviderName]:
        model_name = self.eval_config.model_name
        provider = self.eval_config.model_provider
        if (
            not model_name
            or not provider
            or not isinstance(model_name, str)
            or not isinstance(provider, str)
            or provider not in ModelProviderName.__members__
        ):
            raise ValueError(
                "Model name and provider must be set in the eval config model properties"
            )

        return model_name, ModelProviderName(provider)

    async def run_task_and_eval(
        self, input: str
    ) -> tuple[TaskRun, EvalScores, Dict[str, str] | None]:
        """
        Runs the task on the provided run_config to generate fresh output, then runs the eval on that output.
        """
        if self.run_config is None:
            raise ValueError("Run config is required for run_task_and_eval")

        run_adapter = adapter_for_task(
            self.target_task,
            self.run_config,
            base_adapter_config=AdapterConfig(allow_saving=False),
        )

        # Parse structured input if needed
        parsed_input = input
        if self.target_task.input_json_schema is not None:
            parsed_input = json.loads(input)

        # We don't save by default here. We'll save manually after validating the output.
        run_output = await run_adapter.invoke(parsed_input)

        eval_output, intermediate_outputs = await self.run_eval(run_output)

        validate_schema_with_value_error(
            eval_output, self.score_schema, "Eval output does not match score schema."
        )

        return run_output, eval_output, intermediate_outputs

    @abstractmethod
    async def run_eval(
        self, task_run: TaskRun
    ) -> tuple[EvalScores, Dict[str, str] | None]:
        """
        Runs the eval on the given task run.

        Returns a dictionary of scores which should conform to the score schema, and a dictionary of intermediate outputs (eval thinking).
        """
        pass

    @classmethod
    def build_score_schema(cls, eval: Eval, allow_float_scores: bool = False) -> str:
        """
        Build a JSON schema for the scoring output of the task requirements

        We allow 2 modes: allow_float_scores=True and allow_float_scores=False.

        allow_float_scores=False is used for the call to the model, and forces the model to select from discrete rating options (int 1-5, pass/fail, etc.).
        allow_float_scores=True is used for the final score output (for example, after we take a G-Eval weighting of the model's logprobs). A pass/fail rating might return 0.75 for a likely pass (as opposed to 0.99 for a near-certain pass), or a 1-5 score might return 3.75.
        """

        # Note: Python maintains insertion order, which is good as we want the user-defined order, with overall last
        properties = {}
        for output_score in eval.output_scores:
            output_score_json_key = output_score.json_key()

            if len(output_score_json_key) == 0:
                raise ValueError(
                    f"Invalid output score name: {output_score.name}. Can not be used as JSON schema key."
                )
            property: dict[str, str | int | float | list[str] | list[int]] = {
                "title": output_score.name,
            }
            match output_score.type:
                case TaskOutputRatingType.five_star:
                    if allow_float_scores:
                        property["type"] = "number"
                        property["minimum"] = 1
                        property["maximum"] = 5
                    else:
                        property["enum"] = [1, 2, 3, 4, 5]

                    property["description"] = (
                        f"{output_score.instruction}\n\nThe rating should be between 1 and 5, with 1 being the worst and 5 being the best."
                    )
                case TaskOutputRatingType.pass_fail:
                    if allow_float_scores:
                        property["type"] = "number"
                        property["minimum"] = 0
                        property["maximum"] = 1
                        property["description"] = (
                            f"{output_score.instruction}\n\nThe rating should be between 0 and 1, with 0 being a failure and 1 being a pass."
                        )
                    else:
                        property["enum"] = ["pass", "fail"]
                        property["description"] = (
                            f"{output_score.instruction}\n\nThe rating should be either 'pass' or 'fail'."
                        )
                case TaskOutputRatingType.pass_fail_critical:
                    if allow_float_scores:
                        property["type"] = "number"
                        property["minimum"] = -1
                        property["maximum"] = 1
                        property["description"] = (
                            f"{output_score.instruction}\n\nThe rating should be between -1 and 1, with 1 being a pass, 0 being a failure, and -1 being a critical failure (very severe failure)."
                        )
                    else:
                        property["enum"] = ["pass", "fail", "critical"]
                        property["description"] = (
                            f"{output_score.instruction}\n\nThe rating should be either 'pass', 'fail', or 'critical', where critical is a very severe failure."
                        )
                case TaskOutputRatingType.custom:
                    # Skip custom rating types in evals
                    continue
                case _:
                    raise_exhaustive_enum_error(output_score.type)

            properties[output_score_json_key] = property

        schema = {
            "type": "object",
            "properties": properties,
            "required": list(properties.keys()),
        }
        return json.dumps(schema, ensure_ascii=False)
class BaseEval:
Base class for all evals/evaluators.
Should be subclassed, and the run_eval method implemented.
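A minimal sketch of a concrete subclass. The class name, the "overall_rating" key, and the "reasoning" label are hypothetical placeholders; in a real evaluator the score keys must match the json_key() of each entry in the eval's output_scores so that the result validates against self.score_schema, and EvalScores is treated here as a plain dictionary of score values.

from typing import Dict

from kiln_ai.adapters.eval.base_eval import BaseEval
from kiln_ai.datamodel.eval import EvalScores
from kiln_ai.datamodel.task import TaskRun


class ToyEval(BaseEval):
    """Toy evaluator that rates every task run the same way."""

    async def run_eval(
        self, task_run: TaskRun
    ) -> tuple[EvalScores, Dict[str, str] | None]:
        # A real evaluator would inspect the task run's output (for example,
        # with an LLM judge) and score it against self.eval.output_scores.
        scores: EvalScores = {"overall_rating": 4.5}  # hypothetical score key
        intermediate_outputs = {"reasoning": "Placeholder rationale."}
        return scores, intermediate_outputs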
def __init__(self, eval_config: EvalConfig, run_config: RunConfig | None)

Stores the eval config and run config, resolves the parent Eval and parent Task (raising ValueError if either is missing), and builds the eval's score schema with allow_float_scores=True.
def model_and_provider(self) -> tuple[str, ModelProviderName]

Returns the eval config's model name and provider as (model_name, ModelProviderName), raising ValueError if either is missing or the provider is not a known ModelProviderName.
async def run_task_and_eval(self, input: str) -> tuple[TaskRun, EvalScores, Dict[str, str] | None]
Runs the task on the provided run_config to generate fresh output, then runs the eval on that output.
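A hedged usage sketch, assuming `evaluator` is an instance of a concrete BaseEval subclass constructed with an EvalConfig (whose parent eval and task are set) and a non-None RunConfig. The input string is hypothetical; when the target task declares an input_json_schema, the input must be a JSON string.

import asyncio

from kiln_ai.adapters.eval.base_eval import BaseEval


async def evaluate_one(evaluator: BaseEval, task_input: str) -> None:
    task_run, scores, intermediate = await evaluator.run_task_and_eval(task_input)
    # task_run: the fresh TaskRun produced by running the task with run_config
    # scores: dict of eval scores, already validated against evaluator.score_schema
    # intermediate: optional dict of intermediate outputs (eval "thinking"), or None
    print(scores, intermediate)


# asyncio.run(evaluate_one(evaluator, '{"question": "..."}'))  # `evaluator` assumed to exist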
@abstractmethod
async def run_eval(self, task_run: TaskRun) -> tuple[EvalScores, Dict[str, str] | None]
Runs the eval on the given task run.
Returns a dictionary of scores which should conform to the score schema, and a dictionary of intermediate outputs (eval thinking).
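For illustration, the shape of a return value that satisfies this contract, assuming the eval defines a five_star score keyed "overall_rating" and a pass_fail score keyed "accuracy" (both keys are hypothetical). Float values are acceptable because BaseEval builds its score schema with allow_float_scores=True.

scores: dict[str, float] = {
    "overall_rating": 4.25,  # five_star as a float between 1 and 5
    "accuracy": 0.9,         # pass_fail as a float between 0 (fail) and 1 (pass)
}
intermediate_outputs: dict[str, str] = {
    "chain_of_thought": "The answer covers every point in the reference.",
}
# run_eval returns (scores, intermediate_outputs); run_task_and_eval then validates
# `scores` against the schema from build_score_schema(eval, allow_float_scores=True).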
@classmethod
def build_score_schema(cls, eval: Eval, allow_float_scores: bool = False) -> str
Build a JSON schema for the scoring output of the task requirements
We allow 2 modes: allow_float_scores=True and allow_float_scores=False.
allow_float_scores=False is used for the call to the model, and forces the model to select from discrete rating options (int 1-5, pass/fail, etc.). allow_float_scores=True is used for the final score output (for example, after we take a G-Eval weighting of the model's logprobs). A pass/fail rating might return 0.75 for a likely pass (as opposed to 0.99 for a near-certain pass), or a 1-5 score might return 3.75.
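A sketch of the two modes, assuming `my_eval` is an Eval whose output_scores holds a single five_star score named "Overall Rating", and assuming json_key() maps that name to "overall_rating" (both the eval instance and the key derivation are assumptions).

import json

from kiln_ai.adapters.eval.base_eval import BaseEval

schema = json.loads(BaseEval.build_score_schema(my_eval, allow_float_scores=True))
print(json.dumps(schema, indent=2))
# {
#   "type": "object",
#   "properties": {
#     "overall_rating": {
#       "title": "Overall Rating",
#       "type": "number",
#       "minimum": 1,
#       "maximum": 5,
#       "description": "<the score's instruction>\n\nThe rating should be between 1 and 5, ..."
#     }
#   },
#   "required": ["overall_rating"]
# }
# With allow_float_scores=False, the property instead carries "enum": [1, 2, 3, 4, 5]
# in place of the number/minimum/maximum keys.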