kiln_ai.adapters.eval.base_eval

import json
from abc import abstractmethod
from typing import Dict

from kiln_ai.adapters.adapter_registry import adapter_for_task
from kiln_ai.adapters.ml_model_list import ModelProviderName
from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig
from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalScores
from kiln_ai.datamodel.json_schema import validate_schema_with_value_error
from kiln_ai.datamodel.task import (
    RunConfig,
    RunConfigProperties,
    TaskOutputRatingType,
    TaskRun,
)
from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error


class BaseEval:
    """
    Base class for all evals/evaluators.

    Should be subclassed, and the run_eval method implemented.
    """

    def __init__(self, eval_config: EvalConfig, run_config: RunConfig | None):
        self.eval_config = eval_config
        eval = eval_config.parent_eval()
        if not eval:
            raise ValueError("Eval config must have a parent eval")
        self.eval = eval
        task = self.eval.parent_task()
        if not task:
            raise ValueError("Eval must have a parent task")
        self.target_task = task
        self.score_schema = BaseEval.build_score_schema(eval, allow_float_scores=True)
        self.run_config = run_config

    def model_and_provider(self) -> tuple[str, ModelProviderName]:
        model_name = self.eval_config.model_name
        provider = self.eval_config.model_provider
        if (
            not model_name
            or not provider
            or not isinstance(model_name, str)
            or not isinstance(provider, str)
            or provider not in ModelProviderName.__members__
        ):
            raise ValueError(
                "Model name and provider must be set in the eval config model properties"
            )

        return model_name, ModelProviderName(provider)

    async def run_task_and_eval(
        self, input: str
    ) -> tuple[TaskRun, EvalScores, Dict[str, str] | None]:
        """
        Runs the task on the provided run_config to generate fresh output, then runs the eval on that output.
        """
        if self.run_config is None:
            raise ValueError("Run config is required for run_task_and_eval")

        run_adapter = adapter_for_task(
            self.target_task,
            self.run_config,
            base_adapter_config=AdapterConfig(allow_saving=False),
        )

        # Parse structured input if needed
        parsed_input = input
        if self.target_task.input_json_schema is not None:
            parsed_input = json.loads(input)

        # We don't save by default here. We'll save manually after validating the output.
        run_output = await run_adapter.invoke(parsed_input)

        eval_output, intermediate_outputs = await self.run_eval(run_output)

        validate_schema_with_value_error(
            eval_output, self.score_schema, "Eval output does not match score schema."
        )

        return run_output, eval_output, intermediate_outputs

    @abstractmethod
    async def run_eval(
        self, task_run: TaskRun
    ) -> tuple[EvalScores, Dict[str, str] | None]:
        """
        Runs the eval on the given task run.

        Returns a dictionary of scores which should conform to the score schema, and a dictionary of intermediate outputs (eval thinking).
        """
        pass

    @classmethod
    def build_score_schema(cls, eval: Eval, allow_float_scores: bool = False) -> str:
        """
        Build a JSON schema for the scoring output of the task requirements.

        We allow 2 modes: allow_float_scores=True and allow_float_scores=False.

        allow_float_scores=False is used for the call to the model, and forces the model to select from discrete rating options (int 1-5, pass/fail, etc.).
        allow_float_scores=True is used for final score output (for example, after we take a G-Eval weighting of the model's logprobs). A pass/fail rating might return 0.75 for a likely pass (as opposed to 0.99 for a near-certain pass), or a 1-5 score might return 3.75.
        """

        # Note: Python dicts maintain insertion order, which is good as we want the user-defined order, with overall last.
        properties = {}
        for output_score in eval.output_scores:
            output_score_json_key = output_score.json_key()

            if len(output_score_json_key) == 0:
                raise ValueError(
                    f"Invalid output score name: {output_score.name}. Cannot be used as a JSON schema key."
                )
            property: dict[str, str | int | float | list[str] | list[int]] = {
                "title": output_score.name,
            }
            match output_score.type:
                case TaskOutputRatingType.five_star:
                    if allow_float_scores:
                        property["type"] = "number"
                        property["minimum"] = 1
                        property["maximum"] = 5
                    else:
                        property["enum"] = [1, 2, 3, 4, 5]

                    property["description"] = (
                        f"{output_score.instruction}\n\nThe rating should be between 1 and 5, with 1 being the worst and 5 being the best."
                    )
                case TaskOutputRatingType.pass_fail:
                    if allow_float_scores:
                        property["type"] = "number"
                        property["minimum"] = 0
                        property["maximum"] = 1
                        property["description"] = (
                            f"{output_score.instruction}\n\nThe rating should be between 0 and 1, with 0 being a failure and 1 being a pass."
                        )
                    else:
                        property["enum"] = ["pass", "fail"]
                        property["description"] = (
                            f"{output_score.instruction}\n\nThe rating should be either 'pass' or 'fail'."
                        )
                case TaskOutputRatingType.pass_fail_critical:
                    if allow_float_scores:
                        property["type"] = "number"
                        property["minimum"] = -1
                        property["maximum"] = 1
                        property["description"] = (
                            f"{output_score.instruction}\n\nThe rating should be between -1 and 1, with 1 being a pass, 0 being a failure, and -1 being a critical failure (very severe failure)."
                        )
                    else:
                        property["enum"] = ["pass", "fail", "critical"]
                        property["description"] = (
                            f"{output_score.instruction}\n\nThe rating should be either 'pass', 'fail', or 'critical', where critical is a very severe failure."
                        )
                case TaskOutputRatingType.custom:
                    # Skip custom rating types in evals
                    continue
                case _:
                    raise_exhaustive_enum_error(output_score.type)

            properties[output_score_json_key] = property

        schema = {
            "type": "object",
            "properties": properties,
            "required": list(properties.keys()),
        }
        return json.dumps(schema, ensure_ascii=False)

class BaseEval:

Base class for all evals/evaluators.

Should be subclassed, and the run_eval method implemented.
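
A minimal sketch of a subclass, assuming a TaskRun whose output.output field holds the generated text; the class name, score key, and scoring rule below are hypothetical (a real evaluator would typically call a judge model):

from typing import Dict

from kiln_ai.adapters.eval.base_eval import BaseEval
from kiln_ai.datamodel.eval import EvalScores
from kiln_ai.datamodel.task import TaskRun


class ConcisenessEval(BaseEval):
    # Hypothetical evaluator: a single score based on output length.
    # Real evaluators would usually prompt the judge model from the eval config.
    async def run_eval(
        self, task_run: TaskRun
    ) -> tuple[EvalScores, Dict[str, str] | None]:
        output_text = task_run.output.output  # assumption: rendered output text lives here
        passed = len(output_text) <= 500
        # Keys must match the json_key() of the eval's output_scores, and values
        # must validate against the schema from build_score_schema (floats allowed).
        scores: EvalScores = {"conciseness": 1.0 if passed else 0.0}
        intermediate = {"reasoning": f"Output was {len(output_text)} characters long."}
        return scores, intermediate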

BaseEval(eval_config: kiln_ai.datamodel.eval.EvalConfig, run_config: kiln_ai.datamodel.task.RunConfig | None)
eval_config: The EvalConfig to run, including the model name and provider used for evaluation.
eval: The parent Eval of the eval config.
target_task: The eval's parent task.
score_schema: JSON schema string for eval scores, built with allow_float_scores=True.
run_config: Optional RunConfig used by run_task_and_eval to generate fresh task output.
def model_and_provider(self) -> tuple[str, kiln_ai.datamodel.datamodel_enums.ModelProviderName]:
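
As the module source above shows, this returns the model name and provider from the eval config, raising a ValueError if either is missing or the provider is not a known ModelProviderName. A small usage sketch with hypothetical values:

# `evaluator` is any constructed BaseEval subclass; the printed values are illustrative.
model_name, provider = evaluator.model_and_provider()
print(model_name)  # e.g. "gpt_4o_mini"
print(provider)    # a ModelProviderName member, e.g. ModelProviderName.openai
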
async def run_task_and_eval(self, input: str) -> tuple[kiln_ai.datamodel.TaskRun, typing.Dict[str, float], typing.Optional[typing.Dict[str, str]]]:

Runs the task on the provided run_config to generate fresh output, then runs the eval on that output.
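
A usage sketch, assuming the evaluator was constructed with a non-None run_config and that the target task takes structured (JSON) input; the input value and score key are hypothetical:

import asyncio
import json

from kiln_ai.adapters.eval.base_eval import BaseEval


async def evaluate_one(evaluator: BaseEval) -> None:
    task_input = json.dumps({"question": "What is 2 + 2?"})  # hypothetical task input
    task_run, scores, intermediate_outputs = await evaluator.run_task_and_eval(task_input)
    # task_run is the freshly generated TaskRun, scores maps score keys to floats,
    # and intermediate_outputs (possibly None) holds the eval's thinking.
    print(scores)  # e.g. {"overall_rating": 4.5}


# asyncio.run(evaluate_one(my_evaluator))  # `my_evaluator` is assumed to already exist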

@abstractmethod
async def run_eval(self, task_run: kiln_ai.datamodel.TaskRun) -> tuple[typing.Dict[str, float], typing.Optional[typing.Dict[str, str]]]:

Runs the eval on the given task run.

Returns a dictionary of scores which should conform to the score schema, and a dictionary of intermediate outputs (eval thinking).
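
For reference, a sketch of a conforming return value; the score keys below are hypothetical and must match the json_key() of the eval's output_scores:

from typing import Dict

# Floats are allowed because BaseEval builds its score schema with allow_float_scores=True.
scores: Dict[str, float] = {"helpfulness": 4.25, "overall_rating": 3.75}
# Optional second element: intermediate outputs (eval thinking); may be None.
intermediate_outputs: Dict[str, str] = {"chain_of_thought": "The answer is correct but terse."}
# run_eval returns the pair (scores, intermediate_outputs).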

@classmethod
def build_score_schema(cls, eval: kiln_ai.datamodel.eval.Eval, allow_float_scores: bool = False) -> str:

Build a JSON schema for the scoring output of the task requirements.

We allow 2 modes: allow_float_scores=True and allow_float_scores=False.

allow_float_scores=False is used for the call to the model, and forces the model to select from discrete rating options (int 1-5, pass/fail, etc.). allow_float_scores=True is used for final score output (for example, after we take a G-Eval weighting of the model's logprobs). A pass/fail rating might return 0.75 for a likely pass (as opposed to 0.99 for a near-certain pass), or a 1-5 score might return 3.75.
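
For illustration, assuming an Eval with a single five-star output score named "Overall Rating" whose json_key() is "overall_rating" (the instruction text below is hypothetical), the returned schema looks roughly like this:

import json

from kiln_ai.adapters.eval.base_eval import BaseEval

schema = json.loads(BaseEval.build_score_schema(my_eval, allow_float_scores=True))  # `my_eval` is hypothetical
# schema is roughly:
# {
#     "type": "object",
#     "properties": {
#         "overall_rating": {
#             "title": "Overall Rating",
#             "type": "number",
#             "minimum": 1,
#             "maximum": 5,
#             "description": "Rate the overall quality.\n\nThe rating should be between 1 and 5, with 1 being the worst and 5 being the best."
#         }
#     },
#     "required": ["overall_rating"]
# }
# With allow_float_scores=False, the five-star property would instead use "enum": [1, 2, 3, 4, 5].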