kiln_ai.adapters.eval.base_eval
import json
from abc import abstractmethod
from typing import Dict

import jsonschema

from kiln_ai.adapters.adapter_registry import adapter_for_task
from kiln_ai.adapters.ml_model_list import ModelProviderName
from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig
from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalScores
from kiln_ai.datamodel.json_schema import validate_schema_with_value_error
from kiln_ai.datamodel.task import RunConfig, TaskOutputRatingType, TaskRun
from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error


class BaseEval:
    """
    Base class for all evals/evaluators.

    Should be subclassed, and the run_eval method implemented.
    """

    def __init__(self, eval_config: EvalConfig, run_config: RunConfig | None):
        self.eval_config = eval_config
        eval = eval_config.parent_eval()
        if not eval:
            raise ValueError("Eval config must have a parent eval")
        self.eval = eval
        task = self.eval.parent_task()
        if not task:
            raise ValueError("Eval must have a parent task")
        self.target_task = task
        self.score_schema = BaseEval.build_score_schema(eval, allow_float_scores=True)
        self.run_config = run_config

    def model_and_provider(self) -> tuple[str, ModelProviderName]:
        model_name = self.eval_config.model_name
        provider = self.eval_config.model_provider
        if (
            not model_name
            or not provider
            or not isinstance(model_name, str)
            or not isinstance(provider, str)
            or provider not in ModelProviderName.__members__
        ):
            raise ValueError(
                "Model name and provider must be set in the eval config model properties"
            )

        return model_name, ModelProviderName(provider)

    async def run_task_and_eval(
        self, input: str
    ) -> tuple[TaskRun, EvalScores, Dict[str, str] | None]:
        """
        Runs the task on the provided run_config to generate fresh output, then runs the eval on that output.
        """
        if self.run_config is None:
            raise ValueError("Run config is required for run_task_and_eval")

        run_adapter = adapter_for_task(
            self.target_task,
            self.run_config.model_name,
            ModelProviderName(self.run_config.model_provider_name),
            base_adapter_config=AdapterConfig(allow_saving=False),
        )

        # Parse structured input if needed
        parsed_input = input
        if self.target_task.output_json_schema is not None:
            parsed_input = json.loads(input)

        # we don't save by default here. We'll save manually after validating the output
        run_output = await run_adapter.invoke(parsed_input)

        eval_output, intermediate_outputs = await self.run_eval(run_output)

        validate_schema_with_value_error(
            eval_output, self.score_schema, "Eval output does not match score schema."
        )

        return run_output, eval_output, intermediate_outputs

    @abstractmethod
    async def run_eval(
        self, task_run: TaskRun
    ) -> tuple[EvalScores, Dict[str, str] | None]:
        """
        Runs the eval on the given task run.

        Returns a dictionary of scores which should conform to the score schema, and a dictionary of intermediate outputs (eval thinking).
        """
        pass

    @classmethod
    def build_score_schema(cls, eval: Eval, allow_float_scores: bool = False) -> str:
        """
        Build a JSON schema for the scoring output of the task requirements

        We allow 2 modes: allow_float_scores=True and allow_float_scores=False.

        allow_float_scores=False is used for the call to the model, and forces the model to select from discrete rating options (int 1-5, pass-fail, etc).
        allow_float_scores=True is used for final score output (for example, after we take a g-eval weighting of the model's logprobs). A pass/fail rating might return 0.75 for likely pass (as opposed to 0.99 for near certain pass), or a 1-5 score might return 3.75.
        """

        # Note: python maintains order, which is good as we want the user defined order, and overall last
        properties = {}
        for output_score in eval.output_scores:
            output_score_json_key = output_score.json_key()

            if len(output_score_json_key) == 0:
                raise ValueError(
                    f"Invalid output score name: {output_score.name}. Can not be used as JSON schema key."
                )
            property: dict[str, str | int | float | list[str] | list[int]] = {
                "title": output_score.name,
            }
            match output_score.type:
                case TaskOutputRatingType.five_star:
                    if allow_float_scores:
                        property["type"] = "number"
                        property["minimum"] = 1
                        property["maximum"] = 5
                    else:
                        property["enum"] = [1, 2, 3, 4, 5]

                    property["description"] = (
                        f"{output_score.instruction}\n\nThe rating should be between 1 and 5, with 1 being the worst and 5 being the best."
                    )
                case TaskOutputRatingType.pass_fail:
                    if allow_float_scores:
                        property["type"] = "number"
                        property["minimum"] = 0
                        property["maximum"] = 1
                        property["description"] = (
                            f"{output_score.instruction}\n\nThe rating should be between 0 and 1, with 0 being a failure and 1 being a pass."
                        )
                    else:
                        property["enum"] = ["pass", "fail"]
                        property["description"] = (
                            f"{output_score.instruction}\n\nThe rating should be either 'pass' or 'fail'."
                        )
                case TaskOutputRatingType.pass_fail_critical:
                    if allow_float_scores:
                        property["type"] = "number"
                        property["minimum"] = -1
                        property["maximum"] = 1
                        property["description"] = (
                            f"{output_score.instruction}\n\nThe rating should be between -1 and 1, with 1 being a pass, 0 being a failure, and -1 being a critical failure (very severe failure)."
                        )
                    else:
                        property["enum"] = ["pass", "fail", "critical"]
                        property["description"] = (
                            f"{output_score.instruction}\n\nThe rating should be either 'pass', 'fail', or 'critical', where 'critical' is a very severe failure."
                        )
                case TaskOutputRatingType.custom:
                    # Skip custom rating types in evals
                    continue
                case _:
                    raise_exhaustive_enum_error(output_score.type)

            properties[output_score_json_key] = property

        schema = {
            "type": "object",
            "properties": properties,
            "required": list(properties.keys()),
        }
        return json.dumps(schema, ensure_ascii=False)
class BaseEval
Base class for all evals/evaluators.
Should be subclassed, and the run_eval method implemented.
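For orientation, a minimal subclass might look like the sketch below. Only the BaseEval interface (the constructor arguments and the run_eval signature) comes from this module; the LengthCheckEval class, its scoring rule, and the "pass_fail" score key are hypothetical, and in practice the key must match a json_key() of one of the parent eval's output_scores.

from typing import Dict

from kiln_ai.adapters.eval.base_eval import BaseEval
from kiln_ai.datamodel.eval import EvalScores
from kiln_ai.datamodel.task import TaskRun


class LengthCheckEval(BaseEval):
    # Toy evaluator for illustration: passes outputs under a length limit.
    # Assumes task_run.output.output holds the raw output text.

    async def run_eval(
        self, task_run: TaskRun
    ) -> tuple[EvalScores, Dict[str, str] | None]:
        output_text = task_run.output.output
        passed = len(output_text) < 2000
        scores: EvalScores = {"pass_fail": 1.0 if passed else 0.0}
        thinking = {"reasoning": f"Output length was {len(output_text)} characters."}
        return scores, thinking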
BaseEval(eval_config: EvalConfig, run_config: RunConfig | None)
def model_and_provider(self) -> tuple[str, ModelProviderName]
async def run_task_and_eval(self, input: str) -> tuple[TaskRun, EvalScores, Dict[str, str] | None]
Runs the task on the provided run_config to generate fresh output, then runs the eval on that output.
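A hedged usage sketch, assuming an already-constructed evaluator (MyEval is a hypothetical BaseEval subclass) whose eval_config and run_config have their parent eval and task set; for tasks with structured input, the input string is the JSON-serialized input:

import asyncio
import json

from kiln_ai.adapters.eval.base_eval import BaseEval


async def evaluate_one(evaluator: BaseEval, item_input: dict) -> None:
    # Generates fresh output with the evaluator's run_config, then scores it.
    # The returned scores have already been validated against evaluator.score_schema.
    task_run, scores, intermediate = await evaluator.run_task_and_eval(
        json.dumps(item_input)
    )
    print("scores:", scores)
    print("judge thinking:", intermediate)


# asyncio.run(evaluate_one(MyEval(eval_config, run_config), {"question": "..."}))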
@abstractmethod
async def run_eval(self, task_run: TaskRun) -> tuple[EvalScores, Dict[str, str] | None]
Runs the eval on the given task run.
Returns a dictionary of scores which should conform to the score schema, and a dictionary of intermediate outputs (eval thinking).
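Concretely, a return value might have the shape below. The key names are illustrative: real keys come from output_score.json_key() on the parent eval's output_scores.

from kiln_ai.datamodel.eval import EvalScores

# Illustrative only: assumes the eval defines an "overall_rating" five-star score
# and a "pass_fail" score.
scores: EvalScores = {
    "overall_rating": 3.75,  # floats are fine: BaseEval builds its schema with allow_float_scores=True
    "pass_fail": 0.75,       # e.g. a logprob-weighted pass probability
}
intermediate_outputs: dict[str, str] = {
    "chain_of_thought": "The answer is mostly correct but misses one edge case...",
}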
@classmethod
def build_score_schema(cls, eval: Eval, allow_float_scores: bool = False) -> str
Build a JSON schema for the scoring output of the task requirements
We allow 2 modes: allow_float_scores=True and allow_float_scores=False.
allow_float_scores=False is used for the call to the model, and forces the model to select from discrete rating options (int 1-5, pass-fail, etc).

allow_float_scores=True is used for the final score output (for example, after we take a g-eval weighting of the model's logprobs). A pass/fail rating might return 0.75 for a likely pass (as opposed to 0.99 for a near-certain pass), and a 1-5 score might return 3.75.
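For a single pass/fail score named "Accuracy", the two modes yield schemas roughly like the following. These dicts are hand-written to show the shape: the real schemas also embed the score's instruction in each description, and the "accuracy" key assumes json_key() maps the name to snake case.

# allow_float_scores=False: the judge model must choose a discrete label.
model_facing_schema = {
    "type": "object",
    "properties": {
        "accuracy": {
            "title": "Accuracy",
            "enum": ["pass", "fail"],
            "description": "...\n\nThe rating should be either 'pass' or 'fail'.",
        }
    },
    "required": ["accuracy"],
}

# allow_float_scores=True: final scores may be fractional (e.g. logprob-weighted).
final_score_schema = {
    "type": "object",
    "properties": {
        "accuracy": {
            "title": "Accuracy",
            "type": "number",
            "minimum": 0,
            "maximum": 1,
            "description": "...\n\nThe rating should be between 0 and 1, with 0 being a failure and 1 being a pass.",
        }
    },
    "required": ["accuracy"],
}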