kiln_ai.adapters.eval.base_eval

  1import json
  2from abc import abstractmethod
  3from typing import Dict
  4
  5from kiln_ai.adapters.adapter_registry import adapter_for_task
  6from kiln_ai.adapters.ml_model_list import ModelProviderName
  7from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig, SkillsDict
  8from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalScores
  9from kiln_ai.datamodel.json_schema import validate_schema_with_value_error
 10from kiln_ai.datamodel.task import RunConfigProperties, TaskOutputRatingType, TaskRun
 11from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error
 12
 13
 14class BaseEval:
 15    """
 16    Base class for all evals/evaluators.
 17
 18    Should be subclassed, and the run_eval method implemented.
 19    """
 20
 21    def __init__(
 22        self,
 23        eval_config: EvalConfig,
 24        run_config: RunConfigProperties | None,
 25        skills: SkillsDict | None = None,
 26    ):
 27        self.eval_config = eval_config
 28        eval = eval_config.parent_eval()
 29        if not eval:
 30            raise ValueError("Eval config must have a parent eval")
 31        self.eval = eval
 32        task = self.eval.parent_task()
 33        if not task:
 34            raise ValueError("Eval must have a parent task")
 35        self.target_task = task
 36        self.score_schema = BaseEval.build_score_schema(eval, allow_float_scores=True)
 37        self.run_config = run_config
 38        self.skills = skills
 39
 40    def model_and_provider(self) -> tuple[str, ModelProviderName]:
 41        model_name = self.eval_config.model_name
 42        provider = self.eval_config.model_provider
 43        if (
 44            not model_name
 45            or not provider
 46            or not isinstance(model_name, str)
 47            or not isinstance(provider, str)
 48            or provider not in ModelProviderName.__members__
 49        ):
 50            raise ValueError(
 51                "Model name and provider must be set in the eval config model properties"
 52            )
 53
 54        return model_name, ModelProviderName(provider)
 55
 56    async def run_task_and_eval(
 57        self, eval_job_item: TaskRun
 58    ) -> tuple[TaskRun, EvalScores, Dict[str, str] | None]:
 59        """
 60        Runs the task on the provided run_config to generate fresh output, then runs the eval on that output.
 61        """
 62        input = eval_job_item.input
 63        if self.run_config is None:
 64            raise ValueError("Run config is required for run_task_and_eval")
 65
 66        run_adapter = adapter_for_task(
 67            self.target_task,
 68            self.run_config,
 69            base_adapter_config=AdapterConfig(
 70                allow_saving=False,
 71                skills=self.skills,
 72            ),
 73        )
 74
 75        # Parse structured input if needed
 76        parsed_input = input
 77        if self.target_task.input_json_schema is not None:
 78            parsed_input = json.loads(input)
 79
 80        # we don't save by default here. We'll save manually after validating the output
 81        run_output = await run_adapter.invoke(parsed_input)
 82
 83        eval_output, intermediate_outputs = await self.run_eval(
 84            run_output, eval_job_item
 85        )
 86
 87        validate_schema_with_value_error(
 88            eval_output, self.score_schema, "Eval output does not match score schema."
 89        )
 90
 91        return run_output, eval_output, intermediate_outputs
 92
 93    @abstractmethod
 94    async def run_eval(
 95        self, task_run: TaskRun, eval_job_item: TaskRun | None = None
 96    ) -> tuple[EvalScores, Dict[str, str] | None]:
 97        """
 98        Runs the eval on the given task run.
 99
100        Returns a dictionary of scores which should conform to the score schema, and a dictionary of intermediate outputs (eval thinking).
101        """
102        pass
103
104    @classmethod
105    def build_score_schema(cls, eval: Eval, allow_float_scores: bool = False) -> str:
106        """
107        Build a JSON schema for the scoring output of the task requirements
108
109        We allow 2 modes: allow_float_scores=True and allow_float_scores=False.
110
111        allow_float_scores=False is used for the call to the model, and forces the model into selecting into discrete rating options (int 1-5, pass-fail, etc).
112        allow_float_scores=True is used for final score output (for example, after we take a g-eval weighting of the model's logprobs). A pass/fail rating might return 0.75 for likely pass (as opposed to 0.99 for near certain pass), or a 1-5 score might return 3.75.
113        """
114
115        # Note: python maintains order, which is good as we want the user defined order, and overall last
116        properties = {}
117        for output_score in eval.output_scores:
118            output_score_json_key = output_score.json_key()
119
120            if len(output_score_json_key) == 0:
121                raise ValueError(
122                    f"Invalid output score name: {output_score.name}. Can not be used as JSON schema key."
123                )
124            property: dict[str, str | int | float | list[str] | list[int]] = {
125                "title": output_score.name,
126            }
127
128            match output_score.type:
129                case TaskOutputRatingType.five_star:
130                    if allow_float_scores:
131                        property["type"] = "number"
132                        property["minimum"] = 1
133                        property["maximum"] = 5
134                    else:
135                        property["type"] = "integer"
136                        property["minimum"] = 1
137                        property["maximum"] = 5
138
139                    property["description"] = (
140                        f"{output_score.instruction}\n\nThe rating should be between 1 and 5, with 1 being the worst and 5 being the best."
141                    )
142                case TaskOutputRatingType.pass_fail:
143                    if allow_float_scores:
144                        property["type"] = "number"
145                        property["minimum"] = 0
146                        property["maximum"] = 1
147                        property["description"] = (
148                            f"{output_score.instruction}\n\nThe rating should be between 0 and 1, with 0 being a failure and 1 being a pass."
149                        )
150                    else:
151                        property["enum"] = ["pass", "fail"]
152                        property["type"] = "string"
153                        property["description"] = (
154                            f"{output_score.instruction}\n\nThe rating should be either 'pass' or 'fail'."
155                        )
156                case TaskOutputRatingType.pass_fail_critical:
157                    if allow_float_scores:
158                        property["type"] = "number"
159                        property["minimum"] = -1
160                        property["maximum"] = 1
161                        property["description"] = (
162                            f"{output_score.instruction}\n\nThe rating should be between -1 and 1, with 1 being a pass, 0 being a failure, and -1 being a critical failure (very severe failure)."
163                        )
164                    else:
165                        property["enum"] = ["pass", "fail", "critical"]
166                        property["type"] = "string"
167                        property["description"] = (
168                            f"{output_score.instruction}\n\nThe rating should be either 'pass', 'fail', or 'critical' where critical a very severe failure."
169                        )
170                case TaskOutputRatingType.custom:
171                    # Skip custom rating types in evals
172                    continue
173                case _:
174                    raise_exhaustive_enum_error(output_score.type)
175
176            properties[output_score_json_key] = property
177
178        schema = {
179            "type": "object",
180            "properties": properties,
181            "required": list(properties.keys()),
182            "additionalProperties": False,
183        }
184        return json.dumps(schema, ensure_ascii=False)
class BaseEval:
 15class BaseEval:
 16    """
 17    Base class for all evals/evaluators.
 18
 19    Should be subclassed, and the run_eval method implemented.
 20    """
 21
 22    def __init__(
 23        self,
 24        eval_config: EvalConfig,
 25        run_config: RunConfigProperties | None,
 26        skills: SkillsDict | None = None,
 27    ):
 28        self.eval_config = eval_config
 29        eval = eval_config.parent_eval()
 30        if not eval:
 31            raise ValueError("Eval config must have a parent eval")
 32        self.eval = eval
 33        task = self.eval.parent_task()
 34        if not task:
 35            raise ValueError("Eval must have a parent task")
 36        self.target_task = task
 37        self.score_schema = BaseEval.build_score_schema(eval, allow_float_scores=True)
 38        self.run_config = run_config
 39        self.skills = skills
 40
 41    def model_and_provider(self) -> tuple[str, ModelProviderName]:
 42        model_name = self.eval_config.model_name
 43        provider = self.eval_config.model_provider
 44        if (
 45            not model_name
 46            or not provider
 47            or not isinstance(model_name, str)
 48            or not isinstance(provider, str)
 49            or provider not in ModelProviderName.__members__
 50        ):
 51            raise ValueError(
 52                "Model name and provider must be set in the eval config model properties"
 53            )
 54
 55        return model_name, ModelProviderName(provider)
 56
 57    async def run_task_and_eval(
 58        self, eval_job_item: TaskRun
 59    ) -> tuple[TaskRun, EvalScores, Dict[str, str] | None]:
 60        """
 61        Runs the task on the provided run_config to generate fresh output, then runs the eval on that output.
 62        """
 63        input = eval_job_item.input
 64        if self.run_config is None:
 65            raise ValueError("Run config is required for run_task_and_eval")
 66
 67        run_adapter = adapter_for_task(
 68            self.target_task,
 69            self.run_config,
 70            base_adapter_config=AdapterConfig(
 71                allow_saving=False,
 72                skills=self.skills,
 73            ),
 74        )
 75
 76        # Parse structured input if needed
 77        parsed_input = input
 78        if self.target_task.input_json_schema is not None:
 79            parsed_input = json.loads(input)
 80
 81        # we don't save by default here. We'll save manually after validating the output
 82        run_output = await run_adapter.invoke(parsed_input)
 83
 84        eval_output, intermediate_outputs = await self.run_eval(
 85            run_output, eval_job_item
 86        )
 87
 88        validate_schema_with_value_error(
 89            eval_output, self.score_schema, "Eval output does not match score schema."
 90        )
 91
 92        return run_output, eval_output, intermediate_outputs
 93
 94    @abstractmethod
 95    async def run_eval(
 96        self, task_run: TaskRun, eval_job_item: TaskRun | None = None
 97    ) -> tuple[EvalScores, Dict[str, str] | None]:
 98        """
 99        Runs the eval on the given task run.
100
101        Returns a dictionary of scores which should conform to the score schema, and a dictionary of intermediate outputs (eval thinking).
102        """
103        pass
104
105    @classmethod
106    def build_score_schema(cls, eval: Eval, allow_float_scores: bool = False) -> str:
107        """
108        Build a JSON schema for the scoring output of the task requirements
109
110        We allow 2 modes: allow_float_scores=True and allow_float_scores=False.
111
112        allow_float_scores=False is used for the call to the model, and forces the model into selecting into discrete rating options (int 1-5, pass-fail, etc).
113        allow_float_scores=True is used for final score output (for example, after we take a g-eval weighting of the model's logprobs). A pass/fail rating might return 0.75 for likely pass (as opposed to 0.99 for near certain pass), or a 1-5 score might return 3.75.
114        """
115
116        # Note: python maintains order, which is good as we want the user defined order, and overall last
117        properties = {}
118        for output_score in eval.output_scores:
119            output_score_json_key = output_score.json_key()
120
121            if len(output_score_json_key) == 0:
122                raise ValueError(
123                    f"Invalid output score name: {output_score.name}. Can not be used as JSON schema key."
124                )
125            property: dict[str, str | int | float | list[str] | list[int]] = {
126                "title": output_score.name,
127            }
128
129            match output_score.type:
130                case TaskOutputRatingType.five_star:
131                    if allow_float_scores:
132                        property["type"] = "number"
133                        property["minimum"] = 1
134                        property["maximum"] = 5
135                    else:
136                        property["type"] = "integer"
137                        property["minimum"] = 1
138                        property["maximum"] = 5
139
140                    property["description"] = (
141                        f"{output_score.instruction}\n\nThe rating should be between 1 and 5, with 1 being the worst and 5 being the best."
142                    )
143                case TaskOutputRatingType.pass_fail:
144                    if allow_float_scores:
145                        property["type"] = "number"
146                        property["minimum"] = 0
147                        property["maximum"] = 1
148                        property["description"] = (
149                            f"{output_score.instruction}\n\nThe rating should be between 0 and 1, with 0 being a failure and 1 being a pass."
150                        )
151                    else:
152                        property["enum"] = ["pass", "fail"]
153                        property["type"] = "string"
154                        property["description"] = (
155                            f"{output_score.instruction}\n\nThe rating should be either 'pass' or 'fail'."
156                        )
157                case TaskOutputRatingType.pass_fail_critical:
158                    if allow_float_scores:
159                        property["type"] = "number"
160                        property["minimum"] = -1
161                        property["maximum"] = 1
162                        property["description"] = (
163                            f"{output_score.instruction}\n\nThe rating should be between -1 and 1, with 1 being a pass, 0 being a failure, and -1 being a critical failure (very severe failure)."
164                        )
165                    else:
166                        property["enum"] = ["pass", "fail", "critical"]
167                        property["type"] = "string"
168                        property["description"] = (
169            f"{output_score.instruction}\n\nThe rating should be either 'pass', 'fail', or 'critical' where critical is a very severe failure."
170                        )
171                case TaskOutputRatingType.custom:
172                    # Skip custom rating types in evals
173                    continue
174                case _:
175                    raise_exhaustive_enum_error(output_score.type)
176
177            properties[output_score_json_key] = property
178
179        schema = {
180            "type": "object",
181            "properties": properties,
182            "required": list(properties.keys()),
183            "additionalProperties": False,
184        }
185        return json.dumps(schema, ensure_ascii=False)

Base class for all evals/evaluators.

Should be subclassed, and the run_eval method implemented.

BaseEval( eval_config: kiln_ai.datamodel.eval.EvalConfig, run_config: Optional[Annotated[Union[Annotated[kiln_ai.datamodel.run_config.KilnAgentRunConfigProperties, Tag(tag='kiln_agent')], Annotated[kiln_ai.datamodel.run_config.McpRunConfigProperties, Tag(tag='mcp')]], Discriminator(discriminator=<function _get_run_config_type>, custom_error_type=None, custom_error_message=None, custom_error_context=None)]], skills: Optional[Dict[str, kiln_ai.datamodel.Skill]] = None)
22    def __init__(
23        self,
24        eval_config: EvalConfig,
25        run_config: RunConfigProperties | None,
26        skills: SkillsDict | None = None,
27    ):
28        self.eval_config = eval_config
29        eval = eval_config.parent_eval()
30        if not eval:
31            raise ValueError("Eval config must have a parent eval")
32        self.eval = eval
33        task = self.eval.parent_task()
34        if not task:
35            raise ValueError("Eval must have a parent task")
36        self.target_task = task
37        self.score_schema = BaseEval.build_score_schema(eval, allow_float_scores=True)
38        self.run_config = run_config
39        self.skills = skills
eval_config
eval
target_task
score_schema
run_config
skills
def model_and_provider(self) -> tuple[str, kiln_ai.datamodel.datamodel_enums.ModelProviderName]:
41    def model_and_provider(self) -> tuple[str, ModelProviderName]:
42        model_name = self.eval_config.model_name
43        provider = self.eval_config.model_provider
44        if (
45            not model_name
46            or not provider
47            or not isinstance(model_name, str)
48            or not isinstance(provider, str)
49            or provider not in ModelProviderName.__members__
50        ):
51            raise ValueError(
52                "Model name and provider must be set in the eval config model properties"
53            )
54
55        return model_name, ModelProviderName(provider)
async def run_task_and_eval( self, eval_job_item: kiln_ai.datamodel.TaskRun) -> tuple[kiln_ai.datamodel.TaskRun, typing.Dict[str, float], typing.Optional[typing.Dict[str, str]]]:
57    async def run_task_and_eval(
58        self, eval_job_item: TaskRun
59    ) -> tuple[TaskRun, EvalScores, Dict[str, str] | None]:
60        """
61        Runs the task on the provided run_config to generate fresh output, then runs the eval on that output.
62        """
63        input = eval_job_item.input
64        if self.run_config is None:
65            raise ValueError("Run config is required for run_task_and_eval")
66
67        run_adapter = adapter_for_task(
68            self.target_task,
69            self.run_config,
70            base_adapter_config=AdapterConfig(
71                allow_saving=False,
72                skills=self.skills,
73            ),
74        )
75
76        # Parse structured input if needed
77        parsed_input = input
78        if self.target_task.input_json_schema is not None:
79            parsed_input = json.loads(input)
80
81        # we don't save by default here. We'll save manually after validating the output
82        run_output = await run_adapter.invoke(parsed_input)
83
84        eval_output, intermediate_outputs = await self.run_eval(
85            run_output, eval_job_item
86        )
87
88        validate_schema_with_value_error(
89            eval_output, self.score_schema, "Eval output does not match score schema."
90        )
91
92        return run_output, eval_output, intermediate_outputs

Runs the task on the provided run_config to generate fresh output, then runs the eval on that output.

@abstractmethod
async def run_eval( self, task_run: kiln_ai.datamodel.TaskRun, eval_job_item: kiln_ai.datamodel.TaskRun | None = None) -> tuple[typing.Dict[str, float], typing.Optional[typing.Dict[str, str]]]:
 94    @abstractmethod
 95    async def run_eval(
 96        self, task_run: TaskRun, eval_job_item: TaskRun | None = None
 97    ) -> tuple[EvalScores, Dict[str, str] | None]:
 98        """
 99        Runs the eval on the given task run.
100
101        Returns a dictionary of scores which should conform to the score schema, and a dictionary of intermediate outputs (eval thinking).
102        """
103        pass

Runs the eval on the given task run.

Returns a dictionary of scores which should conform to the score schema, and a dictionary of intermediate outputs (eval thinking).

@classmethod
def build_score_schema( cls, eval: kiln_ai.datamodel.eval.Eval, allow_float_scores: bool = False) -> str:
105    @classmethod
106    def build_score_schema(cls, eval: Eval, allow_float_scores: bool = False) -> str:
107        """
108        Build a JSON schema for the scoring output of the task requirements
109
110        We allow 2 modes: allow_float_scores=True and allow_float_scores=False.
111
112        allow_float_scores=False is used for the call to the model, and forces the model into selecting into discrete rating options (int 1-5, pass-fail, etc).
113        allow_float_scores=True is used for final score output (for example, after we take a g-eval weighting of the model's logprobs). A pass/fail rating might return 0.75 for likely pass (as opposed to 0.99 for near certain pass), or a 1-5 score might return 3.75.
114        """
115
116        # Note: python maintains order, which is good as we want the user defined order, and overall last
117        properties = {}
118        for output_score in eval.output_scores:
119            output_score_json_key = output_score.json_key()
120
121            if len(output_score_json_key) == 0:
122                raise ValueError(
123                    f"Invalid output score name: {output_score.name}. Can not be used as JSON schema key."
124                )
125            property: dict[str, str | int | float | list[str] | list[int]] = {
126                "title": output_score.name,
127            }
128
129            match output_score.type:
130                case TaskOutputRatingType.five_star:
131                    if allow_float_scores:
132                        property["type"] = "number"
133                        property["minimum"] = 1
134                        property["maximum"] = 5
135                    else:
136                        property["type"] = "integer"
137                        property["minimum"] = 1
138                        property["maximum"] = 5
139
140                    property["description"] = (
141                        f"{output_score.instruction}\n\nThe rating should be between 1 and 5, with 1 being the worst and 5 being the best."
142                    )
143                case TaskOutputRatingType.pass_fail:
144                    if allow_float_scores:
145                        property["type"] = "number"
146                        property["minimum"] = 0
147                        property["maximum"] = 1
148                        property["description"] = (
149                            f"{output_score.instruction}\n\nThe rating should be between 0 and 1, with 0 being a failure and 1 being a pass."
150                        )
151                    else:
152                        property["enum"] = ["pass", "fail"]
153                        property["type"] = "string"
154                        property["description"] = (
155                            f"{output_score.instruction}\n\nThe rating should be either 'pass' or 'fail'."
156                        )
157                case TaskOutputRatingType.pass_fail_critical:
158                    if allow_float_scores:
159                        property["type"] = "number"
160                        property["minimum"] = -1
161                        property["maximum"] = 1
162                        property["description"] = (
163                            f"{output_score.instruction}\n\nThe rating should be between -1 and 1, with 1 being a pass, 0 being a failure, and -1 being a critical failure (very severe failure)."
164                        )
165                    else:
166                        property["enum"] = ["pass", "fail", "critical"]
167                        property["type"] = "string"
168                        property["description"] = (
169            f"{output_score.instruction}\n\nThe rating should be either 'pass', 'fail', or 'critical' where critical is a very severe failure."
170                        )
171                case TaskOutputRatingType.custom:
172                    # Skip custom rating types in evals
173                    continue
174                case _:
175                    raise_exhaustive_enum_error(output_score.type)
176
177            properties[output_score_json_key] = property
178
179        schema = {
180            "type": "object",
181            "properties": properties,
182            "required": list(properties.keys()),
183            "additionalProperties": False,
184        }
185        return json.dumps(schema, ensure_ascii=False)

Build a JSON schema for the scoring output of the task requirements

We allow 2 modes: allow_float_scores=True and allow_float_scores=False.

allow_float_scores=False is used for the call to the model, and forces the model into selecting into discrete rating options (int 1-5, pass-fail, etc). allow_float_scores=True is used for final score output (for example, after we take a g-eval weighting of the model's logprobs). A pass/fail rating might return 0.75 for likely pass (as opposed to 0.99 for near certain pass), or a 1-5 score might return 3.75.