kiln_ai.adapters.eval.base_eval
import json
from abc import abstractmethod
from typing import Dict

from kiln_ai.adapters.adapter_registry import adapter_for_task
from kiln_ai.adapters.ml_model_list import ModelProviderName
from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig
from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalScores
from kiln_ai.datamodel.json_schema import validate_schema_with_value_error
from kiln_ai.datamodel.task import RunConfigProperties, TaskOutputRatingType, TaskRun
from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error


class BaseEval:
    """
    Base class for all evals/evaluators.

    Should be subclassed, and the run_eval method implemented.
    """

    def __init__(self, eval_config: EvalConfig, run_config: RunConfigProperties | None):
        self.eval_config = eval_config
        eval = eval_config.parent_eval()
        if not eval:
            raise ValueError("Eval config must have a parent eval")
        self.eval = eval
        task = self.eval.parent_task()
        if not task:
            raise ValueError("Eval must have a parent task")
        self.target_task = task
        self.score_schema = BaseEval.build_score_schema(eval, allow_float_scores=True)
        self.run_config = run_config

    def model_and_provider(self) -> tuple[str, ModelProviderName]:
        model_name = self.eval_config.model_name
        provider = self.eval_config.model_provider
        if (
            not model_name
            or not provider
            or not isinstance(model_name, str)
            or not isinstance(provider, str)
            or provider not in ModelProviderName.__members__
        ):
            raise ValueError(
                "Model name and provider must be set in the eval config model properties"
            )

        return model_name, ModelProviderName(provider)

    async def run_task_and_eval(
        self, eval_job_item: TaskRun
    ) -> tuple[TaskRun, EvalScores, Dict[str, str] | None]:
        """
        Runs the task on the provided run_config to generate fresh output, then runs the eval on that output.
        """
        input = eval_job_item.input
        if self.run_config is None:
            raise ValueError("Run config is required for run_task_and_eval")

        run_adapter = adapter_for_task(
            self.target_task,
            self.run_config,
            base_adapter_config=AdapterConfig(allow_saving=False),
        )

        # Parse structured input if needed
        parsed_input = input
        if self.target_task.input_json_schema is not None:
            parsed_input = json.loads(input)

        # We don't save by default here. We'll save manually after validating the output.
        run_output = await run_adapter.invoke(parsed_input)

        eval_output, intermediate_outputs = await self.run_eval(
            run_output, eval_job_item
        )

        validate_schema_with_value_error(
            eval_output, self.score_schema, "Eval output does not match score schema."
        )

        return run_output, eval_output, intermediate_outputs

    @abstractmethod
    async def run_eval(
        self, task_run: TaskRun, eval_job_item: TaskRun | None = None
    ) -> tuple[EvalScores, Dict[str, str] | None]:
        """
        Runs the eval on the given task run.

        Returns a dictionary of scores which should conform to the score schema, and a dictionary of intermediate outputs (eval thinking).
        """
        pass

    @classmethod
    def build_score_schema(cls, eval: Eval, allow_float_scores: bool = False) -> str:
        """
        Build a JSON schema for the scoring output of the task requirements.

        We allow 2 modes: allow_float_scores=True and allow_float_scores=False.

        allow_float_scores=False is used for the call to the model, and forces the model to select from discrete rating options (integer 1-5, pass/fail, etc.).
        allow_float_scores=True is used for the final score output (for example, after we take a G-Eval weighting of the model's logprobs). A pass/fail rating might return 0.75 for a likely pass (as opposed to 0.99 for a near-certain pass), and a 1-5 score might return 3.75.
        """

        # Note: Python dicts maintain insertion order, which is good as we want the user-defined order, with overall last.
        properties = {}
        for output_score in eval.output_scores:
            output_score_json_key = output_score.json_key()

            if len(output_score_json_key) == 0:
                raise ValueError(
                    f"Invalid output score name: {output_score.name}. Can not be used as JSON schema key."
                )
            property: dict[str, str | int | float | list[str] | list[int]] = {
                "title": output_score.name,
            }
            match output_score.type:
                case TaskOutputRatingType.five_star:
                    if allow_float_scores:
                        property["type"] = "number"
                        property["minimum"] = 1
                        property["maximum"] = 5
                    else:
                        property["type"] = "integer"
                        property["minimum"] = 1
                        property["maximum"] = 5

                    property["description"] = (
                        f"{output_score.instruction}\n\nThe rating should be between 1 and 5, with 1 being the worst and 5 being the best."
                    )
                case TaskOutputRatingType.pass_fail:
                    if allow_float_scores:
                        property["type"] = "number"
                        property["minimum"] = 0
                        property["maximum"] = 1
                        property["description"] = (
                            f"{output_score.instruction}\n\nThe rating should be between 0 and 1, with 0 being a failure and 1 being a pass."
                        )
                    else:
                        property["enum"] = ["pass", "fail"]
                        property["type"] = "string"
                        property["description"] = (
                            f"{output_score.instruction}\n\nThe rating should be either 'pass' or 'fail'."
                        )
                case TaskOutputRatingType.pass_fail_critical:
                    if allow_float_scores:
                        property["type"] = "number"
                        property["minimum"] = -1
                        property["maximum"] = 1
                        property["description"] = (
                            f"{output_score.instruction}\n\nThe rating should be between -1 and 1, with 1 being a pass, 0 being a failure, and -1 being a critical failure (very severe failure)."
                        )
                    else:
                        property["enum"] = ["pass", "fail", "critical"]
                        property["type"] = "string"
                        property["description"] = (
                            f"{output_score.instruction}\n\nThe rating should be either 'pass', 'fail', or 'critical', where critical is a very severe failure."
                        )
                case TaskOutputRatingType.custom:
                    # Skip custom rating types in evals
                    continue
                case _:
                    raise_exhaustive_enum_error(output_score.type)

            properties[output_score_json_key] = property

        schema = {
            "type": "object",
            "properties": properties,
            "required": list(properties.keys()),
        }
        return json.dumps(schema, ensure_ascii=False)
class BaseEval
Base class for all evals/evaluators.
Should be subclassed, and the run_eval method implemented.
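For concreteness, here is a minimal subclass sketch. It assumes EvalScores is a mapping of score key to numeric value (as validated by run_task_and_eval against the float score schema) and that the parent Eval defines a single five_star score with the hypothetical JSON key "overall_rating"; a real evaluator would call a judge model here rather than return a constant.

from typing import Dict

from kiln_ai.adapters.eval.base_eval import BaseEval
from kiln_ai.datamodel.eval import EvalScores
from kiln_ai.datamodel.task import TaskRun


class ConstantScoreEval(BaseEval):
    """Toy evaluator: gives every run the same rating (illustration only)."""

    async def run_eval(
        self, task_run: TaskRun, eval_job_item: TaskRun | None = None
    ) -> tuple[EvalScores, Dict[str, str] | None]:
        # "overall_rating" is a hypothetical key; real keys come from the parent
        # Eval's output_scores entries via their json_key() method.
        scores: EvalScores = {"overall_rating": 4.0}
        # Optional intermediate outputs ("eval thinking").
        intermediate_outputs = {"reasoning": "Constant score, for demonstration only."}
        return scores, intermediate_outputs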
def __init__(self, eval_config: EvalConfig, run_config: RunConfigProperties | None)
def model_and_provider(self) -> tuple[str, ModelProviderName]
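A brief usage sketch, hedged: model_and_provider() reads the judge model stored on the EvalConfig and validates the provider string against the ModelProviderName enum. The evaluator instance and the configured values below are hypothetical.

model_name, provider = evaluator.model_and_provider()  # evaluator: any BaseEval subclass
# e.g. model_name == "gpt_4o" and provider == ModelProviderName("openai"),
# assuming those values were set on the eval config's model properties.
# A missing field, or a provider string not found in
# ModelProviderName.__members__, raises ValueError instead.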
async def run_task_and_eval(self, eval_job_item: TaskRun) -> tuple[TaskRun, EvalScores, Dict[str, str] | None]
Runs the task on the provided run_config to generate fresh output, then runs the eval on that output.
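A usage sketch under assumptions: the evaluator was constructed with a non-None run_config, and eval_job_item is an existing TaskRun from the eval's dataset (construction of both is elided).

import asyncio

from kiln_ai.adapters.eval.base_eval import BaseEval
from kiln_ai.datamodel.task import TaskRun


async def evaluate_item(evaluator: BaseEval, eval_job_item: TaskRun) -> None:
    # Re-runs the task with the evaluator's run_config, then scores the fresh output.
    task_run, scores, intermediate_outputs = await evaluator.run_task_and_eval(eval_job_item)
    # By this point scores has been validated against evaluator.score_schema.
    print(scores)  # e.g. {"overall_rating": 4.25}; keys depend on the eval's output_scores
    if intermediate_outputs:
        print(intermediate_outputs)  # judge "thinking", when the evaluator provides it


# asyncio.run(evaluate_item(my_evaluator, my_eval_job_item))  # hypothetical instances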
@abstractmethod
async def run_eval(self, task_run: TaskRun, eval_job_item: TaskRun | None = None) -> tuple[EvalScores, Dict[str, str] | None]
Runs the eval on the given task run.
Returns a dictionary of scores which should conform to the score schema, and a dictionary of intermediate outputs (eval thinking).
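For illustration, a plausible return value from an implementation, assuming the parent Eval defines a five_star score with JSON key "accuracy" and a pass_fail score with JSON key "toxicity" (both hypothetical keys):

from kiln_ai.datamodel.eval import EvalScores

scores: EvalScores = {
    "accuracy": 4.0,   # five_star: between 1 and 5; floats allowed in the final score schema
    "toxicity": 0.92,  # pass_fail: between 0 and 1, where 1 is a pass
}
intermediate_outputs = {"chain_of_thought": "The answer is factually correct ..."}
# run_eval returns (scores, intermediate_outputs); the second element may be None.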
@classmethod
def build_score_schema(cls, eval: Eval, allow_float_scores: bool = False) -> str
Build a JSON schema for the scoring output of the task requirements.
We allow 2 modes: allow_float_scores=True and allow_float_scores=False.
allow_float_scores=False is used for the call to the model, and forces the model to select from discrete rating options (integer 1-5, pass/fail, etc.).

allow_float_scores=True is used for the final score output (for example, after we take a G-Eval weighting of the model's logprobs). A pass/fail rating might return 0.75 for a likely pass (as opposed to 0.99 for a near-certain pass), and a 1-5 score might return 3.75.
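As a concrete sketch, for a single hypothetical pass_fail score named "Accuracy" (JSON key "accuracy"; instruction text elided as "..."), the two modes yield schemas along these lines, shown here as Python dicts even though build_score_schema itself returns them serialized with json.dumps:

# allow_float_scores=False -- the schema sent to the judge model (discrete choices):
discrete_schema = {
    "type": "object",
    "properties": {
        "accuracy": {
            "title": "Accuracy",
            "enum": ["pass", "fail"],
            "type": "string",
            "description": "...\n\nThe rating should be either 'pass' or 'fail'.",
        }
    },
    "required": ["accuracy"],
}

# allow_float_scores=True -- the schema used for final scores (e.g. after G-Eval
# logprob weighting), where fractional values such as 0.75 are legal:
float_schema = {
    "type": "object",
    "properties": {
        "accuracy": {
            "title": "Accuracy",
            "type": "number",
            "minimum": 0,
            "maximum": 1,
            "description": "...\n\nThe rating should be between 0 and 1, with 0 being a failure and 1 being a pass.",
        }
    },
    "required": ["accuracy"],
}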