kiln_ai.adapters.eval.base_eval
1import json 2from abc import abstractmethod 3from typing import Dict 4 5from kiln_ai.adapters.adapter_registry import adapter_for_task 6from kiln_ai.adapters.ml_model_list import ModelProviderName 7from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig, SkillsDict 8from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalScores 9from kiln_ai.datamodel.json_schema import validate_schema_with_value_error 10from kiln_ai.datamodel.task import RunConfigProperties, TaskOutputRatingType, TaskRun 11from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error 12 13 14class BaseEval: 15 """ 16 Base class for all evals/evaluators. 17 18 Should be subclassed, and the run_eval method implemented. 19 """ 20 21 def __init__( 22 self, 23 eval_config: EvalConfig, 24 run_config: RunConfigProperties | None, 25 skills: SkillsDict | None = None, 26 ): 27 self.eval_config = eval_config 28 eval = eval_config.parent_eval() 29 if not eval: 30 raise ValueError("Eval config must have a parent eval") 31 self.eval = eval 32 task = self.eval.parent_task() 33 if not task: 34 raise ValueError("Eval must have a parent task") 35 self.target_task = task 36 self.score_schema = BaseEval.build_score_schema(eval, allow_float_scores=True) 37 self.run_config = run_config 38 self.skills = skills 39 40 def model_and_provider(self) -> tuple[str, ModelProviderName]: 41 model_name = self.eval_config.model_name 42 provider = self.eval_config.model_provider 43 if ( 44 not model_name 45 or not provider 46 or not isinstance(model_name, str) 47 or not isinstance(provider, str) 48 or provider not in ModelProviderName.__members__ 49 ): 50 raise ValueError( 51 "Model name and provider must be set in the eval config model properties" 52 ) 53 54 return model_name, ModelProviderName(provider) 55 56 async def run_task_and_eval( 57 self, eval_job_item: TaskRun 58 ) -> tuple[TaskRun, EvalScores, Dict[str, str] | None]: 59 """ 60 Runs the task on the provided run_config to generate fresh output, 
then runs the eval on that output. 61 """ 62 input = eval_job_item.input 63 if self.run_config is None: 64 raise ValueError("Run config is required for run_task_and_eval") 65 66 run_adapter = adapter_for_task( 67 self.target_task, 68 self.run_config, 69 base_adapter_config=AdapterConfig( 70 allow_saving=False, 71 skills=self.skills, 72 ), 73 ) 74 75 # Parse structured input if needed 76 parsed_input = input 77 if self.target_task.input_json_schema is not None: 78 parsed_input = json.loads(input) 79 80 # we don't save by default here. We'll save manually after validating the output 81 run_output = await run_adapter.invoke(parsed_input) 82 83 eval_output, intermediate_outputs = await self.run_eval( 84 run_output, eval_job_item 85 ) 86 87 validate_schema_with_value_error( 88 eval_output, self.score_schema, "Eval output does not match score schema." 89 ) 90 91 return run_output, eval_output, intermediate_outputs 92 93 @abstractmethod 94 async def run_eval( 95 self, task_run: TaskRun, eval_job_item: TaskRun | None = None 96 ) -> tuple[EvalScores, Dict[str, str] | None]: 97 """ 98 Runs the eval on the given task run. 99 100 Returns a dictionary of scores which should conform to the score schema, and a dictionary of intermediate outputs (eval thinking). 101 """ 102 pass 103 104 @classmethod 105 def build_score_schema(cls, eval: Eval, allow_float_scores: bool = False) -> str: 106 """ 107 Build a JSON schema for the scoring output of the task requirements 108 109 We allow 2 modes: allow_float_scores=True and allow_float_scores=False. 110 111 allow_float_scores=False is used for the call to the model, and forces the model into selecting into discrete rating options (int 1-5, pass-fail, etc). 112 allow_float_scores=True is used for final score output (for example, after we take a g-eval weighting of the model's logprobs). A pass/fail rating might return 0.75 for likely pass (as opposed to 0.99 for near certain pass), or a 1-5 score might return 3.75. 
113 """ 114 115 # Note: python maintains order, which is good as we want the user defined order, and overall last 116 properties = {} 117 for output_score in eval.output_scores: 118 output_score_json_key = output_score.json_key() 119 120 if len(output_score_json_key) == 0: 121 raise ValueError( 122 f"Invalid output score name: {output_score.name}. Can not be used as JSON schema key." 123 ) 124 property: dict[str, str | int | float | list[str] | list[int]] = { 125 "title": output_score.name, 126 } 127 128 match output_score.type: 129 case TaskOutputRatingType.five_star: 130 if allow_float_scores: 131 property["type"] = "number" 132 property["minimum"] = 1 133 property["maximum"] = 5 134 else: 135 property["type"] = "integer" 136 property["minimum"] = 1 137 property["maximum"] = 5 138 139 property["description"] = ( 140 f"{output_score.instruction}\n\nThe rating should be between 1 and 5, with 1 being the worst and 5 being the best." 141 ) 142 case TaskOutputRatingType.pass_fail: 143 if allow_float_scores: 144 property["type"] = "number" 145 property["minimum"] = 0 146 property["maximum"] = 1 147 property["description"] = ( 148 f"{output_score.instruction}\n\nThe rating should be between 0 and 1, with 0 being a failure and 1 being a pass." 149 ) 150 else: 151 property["enum"] = ["pass", "fail"] 152 property["type"] = "string" 153 property["description"] = ( 154 f"{output_score.instruction}\n\nThe rating should be either 'pass' or 'fail'." 155 ) 156 case TaskOutputRatingType.pass_fail_critical: 157 if allow_float_scores: 158 property["type"] = "number" 159 property["minimum"] = -1 160 property["maximum"] = 1 161 property["description"] = ( 162 f"{output_score.instruction}\n\nThe rating should be between -1 and 1, with 1 being a pass, 0 being a failure, and -1 being a critical failure (very severe failure)." 
163 ) 164 else: 165 property["enum"] = ["pass", "fail", "critical"] 166 property["type"] = "string" 167 property["description"] = ( 168 f"{output_score.instruction}\n\nThe rating should be either 'pass', 'fail', or 'critical' where critical a very severe failure." 169 ) 170 case TaskOutputRatingType.custom: 171 # Skip custom rating types in evals 172 continue 173 case _: 174 raise_exhaustive_enum_error(output_score.type) 175 176 properties[output_score_json_key] = property 177 178 schema = { 179 "type": "object", 180 "properties": properties, 181 "required": list(properties.keys()), 182 "additionalProperties": False, 183 } 184 return json.dumps(schema, ensure_ascii=False)
15class BaseEval: 16 """ 17 Base class for all evals/evaluators. 18 19 Should be subclassed, and the run_eval method implemented. 20 """ 21 22 def __init__( 23 self, 24 eval_config: EvalConfig, 25 run_config: RunConfigProperties | None, 26 skills: SkillsDict | None = None, 27 ): 28 self.eval_config = eval_config 29 eval = eval_config.parent_eval() 30 if not eval: 31 raise ValueError("Eval config must have a parent eval") 32 self.eval = eval 33 task = self.eval.parent_task() 34 if not task: 35 raise ValueError("Eval must have a parent task") 36 self.target_task = task 37 self.score_schema = BaseEval.build_score_schema(eval, allow_float_scores=True) 38 self.run_config = run_config 39 self.skills = skills 40 41 def model_and_provider(self) -> tuple[str, ModelProviderName]: 42 model_name = self.eval_config.model_name 43 provider = self.eval_config.model_provider 44 if ( 45 not model_name 46 or not provider 47 or not isinstance(model_name, str) 48 or not isinstance(provider, str) 49 or provider not in ModelProviderName.__members__ 50 ): 51 raise ValueError( 52 "Model name and provider must be set in the eval config model properties" 53 ) 54 55 return model_name, ModelProviderName(provider) 56 57 async def run_task_and_eval( 58 self, eval_job_item: TaskRun 59 ) -> tuple[TaskRun, EvalScores, Dict[str, str] | None]: 60 """ 61 Runs the task on the provided run_config to generate fresh output, then runs the eval on that output. 62 """ 63 input = eval_job_item.input 64 if self.run_config is None: 65 raise ValueError("Run config is required for run_task_and_eval") 66 67 run_adapter = adapter_for_task( 68 self.target_task, 69 self.run_config, 70 base_adapter_config=AdapterConfig( 71 allow_saving=False, 72 skills=self.skills, 73 ), 74 ) 75 76 # Parse structured input if needed 77 parsed_input = input 78 if self.target_task.input_json_schema is not None: 79 parsed_input = json.loads(input) 80 81 # we don't save by default here. 
We'll save manually after validating the output 82 run_output = await run_adapter.invoke(parsed_input) 83 84 eval_output, intermediate_outputs = await self.run_eval( 85 run_output, eval_job_item 86 ) 87 88 validate_schema_with_value_error( 89 eval_output, self.score_schema, "Eval output does not match score schema." 90 ) 91 92 return run_output, eval_output, intermediate_outputs 93 94 @abstractmethod 95 async def run_eval( 96 self, task_run: TaskRun, eval_job_item: TaskRun | None = None 97 ) -> tuple[EvalScores, Dict[str, str] | None]: 98 """ 99 Runs the eval on the given task run. 100 101 Returns a dictionary of scores which should conform to the score schema, and a dictionary of intermediate outputs (eval thinking). 102 """ 103 pass 104 105 @classmethod 106 def build_score_schema(cls, eval: Eval, allow_float_scores: bool = False) -> str: 107 """ 108 Build a JSON schema for the scoring output of the task requirements 109 110 We allow 2 modes: allow_float_scores=True and allow_float_scores=False. 111 112 allow_float_scores=False is used for the call to the model, and forces the model into selecting into discrete rating options (int 1-5, pass-fail, etc). 113 allow_float_scores=True is used for final score output (for example, after we take a g-eval weighting of the model's logprobs). A pass/fail rating might return 0.75 for likely pass (as opposed to 0.99 for near certain pass), or a 1-5 score might return 3.75. 114 """ 115 116 # Note: python maintains order, which is good as we want the user defined order, and overall last 117 properties = {} 118 for output_score in eval.output_scores: 119 output_score_json_key = output_score.json_key() 120 121 if len(output_score_json_key) == 0: 122 raise ValueError( 123 f"Invalid output score name: {output_score.name}. Can not be used as JSON schema key." 
124 ) 125 property: dict[str, str | int | float | list[str] | list[int]] = { 126 "title": output_score.name, 127 } 128 129 match output_score.type: 130 case TaskOutputRatingType.five_star: 131 if allow_float_scores: 132 property["type"] = "number" 133 property["minimum"] = 1 134 property["maximum"] = 5 135 else: 136 property["type"] = "integer" 137 property["minimum"] = 1 138 property["maximum"] = 5 139 140 property["description"] = ( 141 f"{output_score.instruction}\n\nThe rating should be between 1 and 5, with 1 being the worst and 5 being the best." 142 ) 143 case TaskOutputRatingType.pass_fail: 144 if allow_float_scores: 145 property["type"] = "number" 146 property["minimum"] = 0 147 property["maximum"] = 1 148 property["description"] = ( 149 f"{output_score.instruction}\n\nThe rating should be between 0 and 1, with 0 being a failure and 1 being a pass." 150 ) 151 else: 152 property["enum"] = ["pass", "fail"] 153 property["type"] = "string" 154 property["description"] = ( 155 f"{output_score.instruction}\n\nThe rating should be either 'pass' or 'fail'." 156 ) 157 case TaskOutputRatingType.pass_fail_critical: 158 if allow_float_scores: 159 property["type"] = "number" 160 property["minimum"] = -1 161 property["maximum"] = 1 162 property["description"] = ( 163 f"{output_score.instruction}\n\nThe rating should be between -1 and 1, with 1 being a pass, 0 being a failure, and -1 being a critical failure (very severe failure)." 164 ) 165 else: 166 property["enum"] = ["pass", "fail", "critical"] 167 property["type"] = "string" 168 property["description"] = ( 169 f"{output_score.instruction}\n\nThe rating should be either 'pass', 'fail', or 'critical' where critical a very severe failure." 
170 ) 171 case TaskOutputRatingType.custom: 172 # Skip custom rating types in evals 173 continue 174 case _: 175 raise_exhaustive_enum_error(output_score.type) 176 177 properties[output_score_json_key] = property 178 179 schema = { 180 "type": "object", 181 "properties": properties, 182 "required": list(properties.keys()), 183 "additionalProperties": False, 184 } 185 return json.dumps(schema, ensure_ascii=False)
Base class for all evals/evaluators.
Should be subclassed, and the run_eval method implemented.
def __init__(
    self,
    eval_config: EvalConfig,
    run_config: RunConfigProperties | None,
    skills: SkillsDict | None = None,
):
    """
    Initialize the evaluator from an eval config.

    Args:
        eval_config: Eval config to run; must have a parent Eval, which must have a parent task.
        run_config: Run properties used to generate fresh task output; required only by run_task_and_eval.
        skills: Optional skills forwarded to the task adapter.

    Raises:
        ValueError: If the eval config has no parent eval, or the eval has no parent task.
    """
    self.eval_config = eval_config
    # Renamed from `eval` to avoid shadowing the builtin.
    parent_eval = eval_config.parent_eval()
    if not parent_eval:
        raise ValueError("Eval config must have a parent eval")
    self.eval = parent_eval
    task = self.eval.parent_task()
    if not task:
        raise ValueError("Eval must have a parent task")
    self.target_task = task
    # Final scores may be fractional (e.g. g-eval logprob weighting), so use float mode.
    self.score_schema = BaseEval.build_score_schema(parent_eval, allow_float_scores=True)
    self.run_config = run_config
    self.skills = skills
def model_and_provider(self) -> tuple[str, ModelProviderName]:
    """Return the judge model name and provider configured on the eval config."""
    name = self.eval_config.model_name
    provider_key = self.eval_config.model_provider
    well_formed = (
        isinstance(name, str)
        and isinstance(provider_key, str)
        and bool(name)
        and bool(provider_key)
        and provider_key in ModelProviderName.__members__
    )
    if not well_formed:
        raise ValueError(
            "Model name and provider must be set in the eval config model properties"
        )

    return name, ModelProviderName(provider_key)
async def run_task_and_eval(
    self, eval_job_item: TaskRun
) -> tuple[TaskRun, EvalScores, Dict[str, str] | None]:
    """
    Generate fresh output for the eval item using run_config, then score that output with run_eval.
    """
    raw_input = eval_job_item.input
    if self.run_config is None:
        raise ValueError("Run config is required for run_task_and_eval")

    adapter = adapter_for_task(
        self.target_task,
        self.run_config,
        base_adapter_config=AdapterConfig(
            allow_saving=False,
            skills=self.skills,
        ),
    )

    # Structured tasks carry JSON-encoded input; decode before invoking.
    adapter_input = (
        json.loads(raw_input)
        if self.target_task.input_json_schema is not None
        else raw_input
    )

    # Saving is disabled on the adapter; callers persist manually after validation.
    run_output = await adapter.invoke(adapter_input)

    scores, intermediate_outputs = await self.run_eval(run_output, eval_job_item)

    validate_schema_with_value_error(
        scores, self.score_schema, "Eval output does not match score schema."
    )

    return run_output, scores, intermediate_outputs
Runs the task on the provided run_config to generate fresh output, then runs the eval on that output.
@abstractmethod
async def run_eval(
    self, task_run: TaskRun, eval_job_item: TaskRun | None = None
) -> tuple[EvalScores, Dict[str, str] | None]:
    """
    Score the given task run.

    Returns the scores (conforming to the score schema) and any intermediate outputs (eval thinking), or None.
    """
    pass
Runs the eval on the given task run.
Returns a dictionary of scores which should conform to the score schema, and a dictionary of intermediate outputs (eval thinking).
105 @classmethod 106 def build_score_schema(cls, eval: Eval, allow_float_scores: bool = False) -> str: 107 """ 108 Build a JSON schema for the scoring output of the task requirements 109 110 We allow 2 modes: allow_float_scores=True and allow_float_scores=False. 111 112 allow_float_scores=False is used for the call to the model, and forces the model into selecting into discrete rating options (int 1-5, pass-fail, etc). 113 allow_float_scores=True is used for final score output (for example, after we take a g-eval weighting of the model's logprobs). A pass/fail rating might return 0.75 for likely pass (as opposed to 0.99 for near certain pass), or a 1-5 score might return 3.75. 114 """ 115 116 # Note: python maintains order, which is good as we want the user defined order, and overall last 117 properties = {} 118 for output_score in eval.output_scores: 119 output_score_json_key = output_score.json_key() 120 121 if len(output_score_json_key) == 0: 122 raise ValueError( 123 f"Invalid output score name: {output_score.name}. Can not be used as JSON schema key." 124 ) 125 property: dict[str, str | int | float | list[str] | list[int]] = { 126 "title": output_score.name, 127 } 128 129 match output_score.type: 130 case TaskOutputRatingType.five_star: 131 if allow_float_scores: 132 property["type"] = "number" 133 property["minimum"] = 1 134 property["maximum"] = 5 135 else: 136 property["type"] = "integer" 137 property["minimum"] = 1 138 property["maximum"] = 5 139 140 property["description"] = ( 141 f"{output_score.instruction}\n\nThe rating should be between 1 and 5, with 1 being the worst and 5 being the best." 142 ) 143 case TaskOutputRatingType.pass_fail: 144 if allow_float_scores: 145 property["type"] = "number" 146 property["minimum"] = 0 147 property["maximum"] = 1 148 property["description"] = ( 149 f"{output_score.instruction}\n\nThe rating should be between 0 and 1, with 0 being a failure and 1 being a pass." 
150 ) 151 else: 152 property["enum"] = ["pass", "fail"] 153 property["type"] = "string" 154 property["description"] = ( 155 f"{output_score.instruction}\n\nThe rating should be either 'pass' or 'fail'." 156 ) 157 case TaskOutputRatingType.pass_fail_critical: 158 if allow_float_scores: 159 property["type"] = "number" 160 property["minimum"] = -1 161 property["maximum"] = 1 162 property["description"] = ( 163 f"{output_score.instruction}\n\nThe rating should be between -1 and 1, with 1 being a pass, 0 being a failure, and -1 being a critical failure (very severe failure)." 164 ) 165 else: 166 property["enum"] = ["pass", "fail", "critical"] 167 property["type"] = "string" 168 property["description"] = ( 169 f"{output_score.instruction}\n\nThe rating should be either 'pass', 'fail', or 'critical' where critical a very severe failure." 170 ) 171 case TaskOutputRatingType.custom: 172 # Skip custom rating types in evals 173 continue 174 case _: 175 raise_exhaustive_enum_error(output_score.type) 176 177 properties[output_score_json_key] = property 178 179 schema = { 180 "type": "object", 181 "properties": properties, 182 "required": list(properties.keys()), 183 "additionalProperties": False, 184 } 185 return json.dumps(schema, ensure_ascii=False)
Build a JSON schema for the scoring output of the task requirements
We allow 2 modes: allow_float_scores=True and allow_float_scores=False.
allow_float_scores=False is used for the call to the model, and forces the model into selecting from discrete rating options (int 1-5, pass-fail, etc). allow_float_scores=True is used for final score output (for example, after we take a g-eval weighting of the model's logprobs). A pass/fail rating might return 0.75 for likely pass (as opposed to 0.99 for near certain pass), or a 1-5 score might return 3.75.