kiln_ai.adapters.eval.base_eval
import json
from abc import abstractmethod
from typing import Dict

from kiln_ai.adapters.adapter_registry import adapter_for_task
from kiln_ai.adapters.ml_model_list import ModelProviderName
from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig
from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalScores
from kiln_ai.datamodel.json_schema import validate_schema_with_value_error
from kiln_ai.datamodel.task import RunConfigProperties, TaskOutputRatingType, TaskRun
from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error


class BaseEval:
    """
    Base class for all evals/evaluators.

    Should be subclassed, and the run_eval method implemented.
    """

    def __init__(self, eval_config: EvalConfig, run_config: RunConfigProperties | None):
        self.eval_config = eval_config
        eval = eval_config.parent_eval()
        if not eval:
            raise ValueError("Eval config must have a parent eval")
        self.eval = eval
        task = self.eval.parent_task()
        if not task:
            raise ValueError("Eval must have a parent task")
        self.target_task = task
        self.score_schema = BaseEval.build_score_schema(eval, allow_float_scores=True)
        self.run_config = run_config

    def model_and_provider(self) -> tuple[str, ModelProviderName]:
        model_name = self.eval_config.model_name
        provider = self.eval_config.model_provider
        if (
            not model_name
            or not provider
            or not isinstance(model_name, str)
            or not isinstance(provider, str)
            or provider not in ModelProviderName.__members__
        ):
            raise ValueError(
                "Model name and provider must be set in the eval config model properties"
            )

        return model_name, ModelProviderName(provider)

    async def run_task_and_eval(
        self, input: str
    ) -> tuple[TaskRun, EvalScores, Dict[str, str] | None]:
        """
        Runs the task on the provided run_config to generate fresh output, then runs the eval on that output.
        """
        if self.run_config is None:
            raise ValueError("Run config is required for run_task_and_eval")

        run_adapter = adapter_for_task(
            self.target_task,
            self.run_config,
            base_adapter_config=AdapterConfig(allow_saving=False),
        )

        # Parse structured input if needed
        parsed_input = input
        if self.target_task.input_json_schema is not None:
            parsed_input = json.loads(input)

        # We don't save by default here. We'll save manually after validating the output.
        run_output = await run_adapter.invoke(parsed_input)

        eval_output, intermediate_outputs = await self.run_eval(run_output)

        validate_schema_with_value_error(
            eval_output, self.score_schema, "Eval output does not match score schema."
        )

        return run_output, eval_output, intermediate_outputs

    @abstractmethod
    async def run_eval(
        self, task_run: TaskRun
    ) -> tuple[EvalScores, Dict[str, str] | None]:
        """
        Runs the eval on the given task run.

        Returns a dictionary of scores which should conform to the score schema, and a dictionary of intermediate outputs (eval thinking).
        """
        pass

    @classmethod
    def build_score_schema(cls, eval: Eval, allow_float_scores: bool = False) -> str:
        """
        Build a JSON schema for the scoring output of the task requirements.

        We allow 2 modes: allow_float_scores=True and allow_float_scores=False.

        allow_float_scores=False is used for the call to the model, and forces the model to select from discrete rating options (int 1-5, pass/fail, etc.).
        allow_float_scores=True is used for final score output (for example, after we take a G-Eval weighting of the model's logprobs). A pass/fail rating might return 0.75 for a likely pass (as opposed to 0.99 for a near-certain pass), or a 1-5 score might return 3.75.
        """

        # Note: Python maintains dict insertion order, which is good as we want the user-defined order, with overall last
        properties = {}
        for output_score in eval.output_scores:
            output_score_json_key = output_score.json_key()

            if len(output_score_json_key) == 0:
                raise ValueError(
                    f"Invalid output score name: {output_score.name}. Can not be used as JSON schema key."
                )
            property: dict[str, str | int | float | list[str] | list[int]] = {
                "title": output_score.name,
            }
            match output_score.type:
                case TaskOutputRatingType.five_star:
                    if allow_float_scores:
                        property["type"] = "number"
                        property["minimum"] = 1
                        property["maximum"] = 5
                    else:
                        property["type"] = "integer"
                        property["minimum"] = 1
                        property["maximum"] = 5

                    property["description"] = (
                        f"{output_score.instruction}\n\nThe rating should be between 1 and 5, with 1 being the worst and 5 being the best."
                    )
                case TaskOutputRatingType.pass_fail:
                    if allow_float_scores:
                        property["type"] = "number"
                        property["minimum"] = 0
                        property["maximum"] = 1
                        property["description"] = (
                            f"{output_score.instruction}\n\nThe rating should be between 0 and 1, with 0 being a failure and 1 being a pass."
                        )
                    else:
                        property["enum"] = ["pass", "fail"]
                        property["type"] = "string"
                        property["description"] = (
                            f"{output_score.instruction}\n\nThe rating should be either 'pass' or 'fail'."
                        )
                case TaskOutputRatingType.pass_fail_critical:
                    if allow_float_scores:
                        property["type"] = "number"
                        property["minimum"] = -1
                        property["maximum"] = 1
                        property["description"] = (
                            f"{output_score.instruction}\n\nThe rating should be between -1 and 1, with 1 being a pass, 0 being a failure, and -1 being a critical failure (very severe failure)."
                        )
                    else:
                        property["enum"] = ["pass", "fail", "critical"]
                        property["type"] = "string"
                        property["description"] = (
                            f"{output_score.instruction}\n\nThe rating should be either 'pass', 'fail', or 'critical', where 'critical' is a very severe failure."
                        )
                case TaskOutputRatingType.custom:
                    # Skip custom rating types in evals
                    continue
                case _:
                    raise_exhaustive_enum_error(output_score.type)

            properties[output_score_json_key] = property

        schema = {
            "type": "object",
            "properties": properties,
            "required": list(properties.keys()),
        }
        return json.dumps(schema, ensure_ascii=False)
class BaseEval
Base class for all evals/evaluators.
Should be subclassed, and the run_eval method implemented.
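As a minimal illustration (not part of this module), a subclass might look like the sketch below. It assumes the parent Eval defines a pass/fail output score whose json_key() is "non_empty_output", and that task_run.output.output holds the generated output string; both names are assumptions made for the example.

from typing import Dict

from kiln_ai.adapters.eval.base_eval import BaseEval
from kiln_ai.datamodel.eval import EvalScores
from kiln_ai.datamodel.task import TaskRun


class LengthCheckEval(BaseEval):
    """Toy evaluator: passes any run that produced non-empty output."""

    async def run_eval(
        self, task_run: TaskRun
    ) -> tuple[EvalScores, Dict[str, str] | None]:
        # Assumptions for this sketch: task_run.output.output is the raw output
        # string, and the parent Eval defines a pass/fail score whose json_key()
        # is "non_empty_output" (scored as a float, matching the schema built
        # with allow_float_scores=True).
        passed = bool(task_run.output.output and task_run.output.output.strip())
        scores: EvalScores = {"non_empty_output": 1.0 if passed else 0.0}
        intermediate = {"reasoning": "Checked whether the output string is non-empty."}
        return scores, intermediate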
def __init__(self, eval_config: EvalConfig, run_config: RunConfigProperties | None)
def model_and_provider(self) -> tuple[str, ModelProviderName]
async def run_task_and_eval(self, input: str) -> tuple[TaskRun, EvalScores, Dict[str, str] | None]
Runs the task on the provided run_config to generate fresh output, then runs the eval on that output.
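A hedged usage sketch, assuming eval_config and run_config have already been loaded from an existing Kiln project and using the illustrative LengthCheckEval subclass above:

from typing import Dict

from kiln_ai.datamodel.eval import EvalConfig, EvalScores
from kiln_ai.datamodel.task import RunConfigProperties, TaskRun


async def run_one_sample(
    eval_config: EvalConfig, run_config: RunConfigProperties, sample_input: str
) -> tuple[TaskRun, EvalScores, Dict[str, str] | None]:
    # LengthCheckEval is the illustrative subclass sketched earlier.
    evaluator = LengthCheckEval(eval_config, run_config)
    # sample_input must be a JSON string when the target task defines an
    # input_json_schema; otherwise it is passed through as plain text.
    return await evaluator.run_task_and_eval(sample_input)

Note that the returned TaskRun is not saved automatically (the adapter is created with allow_saving=False), so callers can persist it themselves after inspecting the validated scores.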
@abstractmethod
async def run_eval(self, task_run: TaskRun) -> tuple[EvalScores, Dict[str, str] | None]
Runs the eval on the given task run.
Returns a dictionary of scores which should conform to the score schema, and a dictionary of intermediate outputs (eval thinking).
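For concreteness, the return value might have the shape sketched below, assuming EvalScores behaves as a mapping of score json_key to numeric value, and that the parent Eval defines a five-star "overall_rating" score plus a pass/fail "accuracy" score (both keys are hypothetical):

from typing import Dict

from kiln_ai.datamodel.eval import EvalScores

# Hypothetical run_eval result; keys must match the json_key() values of the
# parent Eval's output scores.
scores: EvalScores = {"overall_rating": 4.0, "accuracy": 1.0}
intermediate_outputs: Dict[str, str] = {
    "thinking": "The answer addresses the question and contains no factual errors.",
}
result: tuple[EvalScores, Dict[str, str] | None] = (scores, intermediate_outputs)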
@classmethod
def build_score_schema(cls, eval: Eval, allow_float_scores: bool = False) -> str
Build a JSON schema for the scoring output of the task requirements.
We allow 2 modes: allow_float_scores=True and allow_float_scores=False.
allow_float_scores=False is used for the call to the model, and forces the model to select from discrete rating options (int 1-5, pass/fail, etc.).

allow_float_scores=True is used for final score output (for example, after we take a G-Eval weighting of the model's logprobs). A pass/fail rating might return 0.75 for a likely pass (as opposed to 0.99 for a near-certain pass), or a 1-5 score might return 3.75.
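As an example of the schema this produces (derived from the implementation above), assume my_eval is an existing Eval with a single five_star output score named "Overall Rating" whose json_key() is assumed to be "overall_rating":

import json

from kiln_ai.adapters.eval.base_eval import BaseEval

# `my_eval` is assumed to be an existing Eval instance as described above.
schema = json.loads(BaseEval.build_score_schema(my_eval, allow_float_scores=True))

# Based on the implementation, `schema` is equivalent to:
# {
#     "type": "object",
#     "properties": {
#         "overall_rating": {
#             "title": "Overall Rating",
#             "type": "number",
#             "minimum": 1,
#             "maximum": 5,
#             "description": "<instruction>\n\nThe rating should be between 1 and 5, ...",
#         }
#     },
#     "required": ["overall_rating"],
# }
# With allow_float_scores=False, the property "type" would be "integer" instead.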