kiln_ai.adapters.eval.base_eval

import json
from abc import abstractmethod
from typing import Dict

from kiln_ai.adapters.adapter_registry import adapter_for_task
from kiln_ai.adapters.ml_model_list import ModelProviderName
from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig
from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalScores
from kiln_ai.datamodel.json_schema import validate_schema_with_value_error
from kiln_ai.datamodel.task import RunConfigProperties, TaskOutputRatingType, TaskRun
from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error


class BaseEval:
    """
    Base class for all evals/evaluators.

    Should be subclassed, and the run_eval method implemented.
    """

    def __init__(self, eval_config: EvalConfig, run_config: RunConfigProperties | None):
        self.eval_config = eval_config
        eval = eval_config.parent_eval()
        if not eval:
            raise ValueError("Eval config must have a parent eval")
        self.eval = eval
        task = self.eval.parent_task()
        if not task:
            raise ValueError("Eval must have a parent task")
        self.target_task = task
        self.score_schema = BaseEval.build_score_schema(eval, allow_float_scores=True)
        self.run_config = run_config

    def model_and_provider(self) -> tuple[str, ModelProviderName]:
        model_name = self.eval_config.model_name
        provider = self.eval_config.model_provider
        if (
            not model_name
            or not provider
            or not isinstance(model_name, str)
            or not isinstance(provider, str)
            or provider not in ModelProviderName.__members__
        ):
            raise ValueError(
                "Model name and provider must be set in the eval config model properties"
            )

        return model_name, ModelProviderName(provider)

    async def run_task_and_eval(
        self, eval_job_item: TaskRun
    ) -> tuple[TaskRun, EvalScores, Dict[str, str] | None]:
        """
        Runs the task on the provided run_config to generate fresh output, then runs the eval on that output.
        """
        input = eval_job_item.input
        if self.run_config is None:
            raise ValueError("Run config is required for run_task_and_eval")

        run_adapter = adapter_for_task(
            self.target_task,
            self.run_config,
            base_adapter_config=AdapterConfig(allow_saving=False),
        )

        # Parse structured input if needed
        parsed_input = input
        if self.target_task.input_json_schema is not None:
            parsed_input = json.loads(input)

        # We don't save by default here. We'll save manually after validating the output.
        run_output = await run_adapter.invoke(parsed_input)

        eval_output, intermediate_outputs = await self.run_eval(
            run_output, eval_job_item
        )

        validate_schema_with_value_error(
            eval_output, self.score_schema, "Eval output does not match score schema."
        )

        return run_output, eval_output, intermediate_outputs

    @abstractmethod
    async def run_eval(
        self, task_run: TaskRun, eval_job_item: TaskRun | None = None
    ) -> tuple[EvalScores, Dict[str, str] | None]:
        """
        Runs the eval on the given task run.

        Returns a dictionary of scores which should conform to the score schema, and a dictionary of intermediate outputs (eval thinking).
        """
        pass

    @classmethod
    def build_score_schema(cls, eval: Eval, allow_float_scores: bool = False) -> str:
        """
        Build a JSON schema for the scoring output of the task requirements

        We allow 2 modes: allow_float_scores=True and allow_float_scores=False.

        allow_float_scores=False is used for the call to the model, and forces the model to select from discrete rating options (int 1-5, pass/fail, etc.).
        allow_float_scores=True is used for final score output (for example, after we take a g-eval weighting of the model's logprobs). A pass/fail rating might return 0.75 for a likely pass (as opposed to 0.99 for a near-certain pass), or a 1-5 score might return 3.75.
        """

        # Note: Python dicts maintain insertion order, which is good as we want the user-defined order, with the overall score last
        properties = {}
        for output_score in eval.output_scores:
            output_score_json_key = output_score.json_key()

            if len(output_score_json_key) == 0:
                raise ValueError(
                    f"Invalid output score name: {output_score.name}. Cannot be used as a JSON schema key."
                )
            property: dict[str, str | int | float | list[str] | list[int]] = {
                "title": output_score.name,
            }
            match output_score.type:
                case TaskOutputRatingType.five_star:
                    if allow_float_scores:
                        property["type"] = "number"
                        property["minimum"] = 1
                        property["maximum"] = 5
                    else:
                        property["type"] = "integer"
                        property["minimum"] = 1
                        property["maximum"] = 5

                    property["description"] = (
                        f"{output_score.instruction}\n\nThe rating should be between 1 and 5, with 1 being the worst and 5 being the best."
                    )
                case TaskOutputRatingType.pass_fail:
                    if allow_float_scores:
                        property["type"] = "number"
                        property["minimum"] = 0
                        property["maximum"] = 1
                        property["description"] = (
                            f"{output_score.instruction}\n\nThe rating should be between 0 and 1, with 0 being a failure and 1 being a pass."
                        )
                    else:
                        property["enum"] = ["pass", "fail"]
                        property["type"] = "string"
                        property["description"] = (
                            f"{output_score.instruction}\n\nThe rating should be either 'pass' or 'fail'."
                        )
                case TaskOutputRatingType.pass_fail_critical:
                    if allow_float_scores:
                        property["type"] = "number"
                        property["minimum"] = -1
                        property["maximum"] = 1
                        property["description"] = (
                            f"{output_score.instruction}\n\nThe rating should be between -1 and 1, with 1 being a pass, 0 being a failure, and -1 being a critical failure (very severe failure)."
                        )
                    else:
                        property["enum"] = ["pass", "fail", "critical"]
                        property["type"] = "string"
                        property["description"] = (
                            f"{output_score.instruction}\n\nThe rating should be either 'pass', 'fail', or 'critical', where critical is a very severe failure."
                        )
                case TaskOutputRatingType.custom:
                    # Skip custom rating types in evals
                    continue
                case _:
                    raise_exhaustive_enum_error(output_score.type)

            properties[output_score_json_key] = property

        schema = {
            "type": "object",
            "properties": properties,
            "required": list(properties.keys()),
        }
        return json.dumps(schema, ensure_ascii=False)

class BaseEval:

Base class for all evals/evaluators.

Should be subclassed, and the run_eval method implemented.

BaseEval(eval_config: kiln_ai.datamodel.eval.EvalConfig, run_config: kiln_ai.datamodel.run_config.RunConfigProperties | None)
eval_config: the EvalConfig this evaluator was created from; supplies the judge model name and provider.
eval: the parent Eval of the eval config, which defines the output scores.
target_task: the parent task of the eval; the task whose outputs are being evaluated.
score_schema: JSON schema string (built with allow_float_scores=True) that eval output must conform to.
run_config: optional RunConfigProperties used by run_task_and_eval to generate fresh task output.
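
A hedged construction sketch, continuing the ToyEval example above. It assumes my_eval_config is an EvalConfig already attached to a parent Eval (which itself has a parent task) and my_run_config is a RunConfigProperties; both names are placeholders.

# Construct a concrete evaluator; a missing parent Eval or parent task
# raises ValueError. run_config may be None if you only score existing
# task runs via run_eval, but run_task_and_eval requires it.
evaluator = ToyEval(eval_config=my_eval_config, run_config=my_run_config)

print(evaluator.target_task.name)  # task resolved from the parent chain
print(evaluator.score_schema)      # JSON schema string, float scores allowed
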
def model_and_provider(self) -> tuple[str, kiln_ai.datamodel.datamodel_enums.ModelProviderName]:
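
For illustration, a short hedged sketch of how the judge model is resolved; the example value in the comment is hypothetical and depends entirely on the eval config's properties.

# Read the judge model from the eval config. Raises ValueError if either
# value is missing or the provider isn't a known ModelProviderName member.
model_name, provider = evaluator.model_and_provider()
# e.g. ("gpt_4o", ModelProviderName.openai)  -- illustrative values only
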
async def run_task_and_eval(self, eval_job_item: kiln_ai.datamodel.TaskRun) -> tuple[kiln_ai.datamodel.TaskRun, typing.Dict[str, float], typing.Optional[typing.Dict[str, str]]]:

Runs the task on the provided run_config to generate fresh output, then runs the eval on that output.
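
A hedged usage sketch: iterating over dataset items (TaskRun objects, here called eval_job_items and assumed to come from the eval's configured dataset) and collecting the validated scores.

import asyncio

from kiln_ai.adapters.eval.base_eval import BaseEval
from kiln_ai.datamodel.task import TaskRun


async def run_all(evaluator: BaseEval, eval_job_items: list[TaskRun]):
    results = []
    for item in eval_job_items:
        # Re-runs the task with the evaluator's run_config, then scores the
        # fresh output; scores are validated against score_schema before return.
        task_run, scores, intermediate = await evaluator.run_task_and_eval(item)
        results.append((task_run, scores, intermediate))
    return results


# results = asyncio.run(run_all(evaluator, eval_job_items))
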

@abstractmethod
async def run_eval(self, task_run: kiln_ai.datamodel.TaskRun, eval_job_item: kiln_ai.datamodel.TaskRun | None = None) -> tuple[typing.Dict[str, float], typing.Optional[typing.Dict[str, str]]]:

Runs the eval on the given task run.

Returns a dictionary of scores which should conform to the score schema, and a dictionary of intermediate outputs (eval thinking).
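
A hedged sketch of the two values a run_eval implementation returns, for a hypothetical eval with a five_star score keyed "quality" and a pass_fail score keyed "safety" (both keys are illustrative).

# EvalScores is a Dict[str, float]; with allow_float_scores=True the schema
# accepts fractional values, e.g. a g-eval weighted 4.25 on the 1-5 scale or
# 0.9 on the pass/fail (0-1) scale.
scores = {"quality": 4.25, "safety": 0.9}
# Intermediate outputs are free-form strings (e.g. judge reasoning), or None.
intermediate_outputs = {"eval_thinking": "Accurate answer, minor style issues."}
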

@classmethod
def build_score_schema(cls, eval: kiln_ai.datamodel.eval.Eval, allow_float_scores: bool = False) -> str:

Build a JSON schema for the scoring output of the task requirements

We allow 2 modes: allow_float_scores=True and allow_float_scores=False.

allow_float_scores=False is used for the call to the model, and forces the model to select from discrete rating options (int 1-5, pass/fail, etc.). allow_float_scores=True is used for final score output (for example, after we take a g-eval weighting of the model's logprobs). A pass/fail rating might return 0.75 for a likely pass (as opposed to 0.99 for a near-certain pass), or a 1-5 score might return 3.75.
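
To make the two modes concrete, here is a hedged sketch of the schema produced for a hypothetical Eval (my_eval) whose output_scores are a five_star score named "Quality" and a pass_fail score named "Safety", assuming json_key() maps those names to "quality" and "safety".

import json

from kiln_ai.adapters.eval.base_eval import BaseEval

schema_str = BaseEval.build_score_schema(my_eval, allow_float_scores=True)
schema = json.loads(schema_str)
# Roughly (descriptions elided):
# {
#   "type": "object",
#   "properties": {
#     "quality": {"title": "Quality", "type": "number", "minimum": 1, "maximum": 5, ...},
#     "safety":  {"title": "Safety",  "type": "number", "minimum": 0, "maximum": 1, ...}
#   },
#   "required": ["quality", "safety"]
# }
# With allow_float_scores=False the same call would instead emit an integer
# 1-5 for "quality" and an enum ["pass", "fail"] string for "safety".
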