kiln_ai.adapters.eval.base_eval

import json
from abc import abstractmethod
from typing import Dict

from kiln_ai.adapters.adapter_registry import adapter_for_task
from kiln_ai.adapters.ml_model_list import ModelProviderName
from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig
from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalScores
from kiln_ai.datamodel.json_schema import validate_schema_with_value_error
from kiln_ai.datamodel.task import RunConfigProperties, TaskOutputRatingType, TaskRun
from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error


class BaseEval:
    """
    Base class for all evals/evaluators.

    Should be subclassed, and the run_eval method implemented.
    """

    def __init__(self, eval_config: EvalConfig, run_config: RunConfigProperties | None):
        self.eval_config = eval_config
        eval = eval_config.parent_eval()
        if not eval:
            raise ValueError("Eval config must have a parent eval")
        self.eval = eval
        task = self.eval.parent_task()
        if not task:
            raise ValueError("Eval must have a parent task")
        self.target_task = task
        self.score_schema = BaseEval.build_score_schema(eval, allow_float_scores=True)
        self.run_config = run_config

    def model_and_provider(self) -> tuple[str, ModelProviderName]:
        model_name = self.eval_config.model_name
        provider = self.eval_config.model_provider
        if (
            not model_name
            or not provider
            or not isinstance(model_name, str)
            or not isinstance(provider, str)
            or provider not in ModelProviderName.__members__
        ):
            raise ValueError(
                "Model name and provider must be set in the eval config model properties"
            )

        return model_name, ModelProviderName(provider)

    async def run_task_and_eval(
        self, input: str
    ) -> tuple[TaskRun, EvalScores, Dict[str, str] | None]:
        """
        Runs the task on the provided run_config to generate fresh output, then runs the eval on that output.
        """
        if self.run_config is None:
            raise ValueError("Run config is required for run_task_and_eval")

        run_adapter = adapter_for_task(
            self.target_task,
            self.run_config,
            base_adapter_config=AdapterConfig(allow_saving=False),
        )

        # Parse structured input if needed
        parsed_input = input
        if self.target_task.input_json_schema is not None:
            parsed_input = json.loads(input)

        # We don't save by default here. We'll save manually after validating the output.
        run_output = await run_adapter.invoke(parsed_input)

        eval_output, intermediate_outputs = await self.run_eval(run_output)

        validate_schema_with_value_error(
            eval_output, self.score_schema, "Eval output does not match score schema."
        )

        return run_output, eval_output, intermediate_outputs

    @abstractmethod
    async def run_eval(
        self, task_run: TaskRun
    ) -> tuple[EvalScores, Dict[str, str] | None]:
        """
        Runs the eval on the given task run.

        Returns a dictionary of scores which should conform to the score schema, and a dictionary of intermediate outputs (eval thinking).
        """
        pass

    @classmethod
    def build_score_schema(cls, eval: Eval, allow_float_scores: bool = False) -> str:
        """
        Build a JSON schema for the scoring output of the task requirements.

        We allow 2 modes: allow_float_scores=True and allow_float_scores=False.

        allow_float_scores=False is used for the call to the model, and forces the model to select from discrete rating options (int 1-5, pass-fail, etc.).
        allow_float_scores=True is used for the final score output (for example, after we take a g-eval weighting of the model's logprobs). A pass/fail rating might return 0.75 for a likely pass (as opposed to 0.99 for a near-certain pass), or a 1-5 score might return 3.75.
        """

        # Note: Python maintains dict insertion order, which is good as we want the user-defined order, and overall last
        properties = {}
        for output_score in eval.output_scores:
            output_score_json_key = output_score.json_key()

            if len(output_score_json_key) == 0:
                raise ValueError(
                    f"Invalid output score name: {output_score.name}. Cannot be used as a JSON schema key."
                )
            property: dict[str, str | int | float | list[str] | list[int]] = {
                "title": output_score.name,
            }
            match output_score.type:
                case TaskOutputRatingType.five_star:
                    if allow_float_scores:
                        property["type"] = "number"
                        property["minimum"] = 1
                        property["maximum"] = 5
                    else:
                        property["type"] = "integer"
                        property["minimum"] = 1
                        property["maximum"] = 5

                    property["description"] = (
                        f"{output_score.instruction}\n\nThe rating should be between 1 and 5, with 1 being the worst and 5 being the best."
                    )
                case TaskOutputRatingType.pass_fail:
                    if allow_float_scores:
                        property["type"] = "number"
                        property["minimum"] = 0
                        property["maximum"] = 1
                        property["description"] = (
                            f"{output_score.instruction}\n\nThe rating should be between 0 and 1, with 0 being a failure and 1 being a pass."
                        )
                    else:
                        property["enum"] = ["pass", "fail"]
                        property["type"] = "string"
                        property["description"] = (
                            f"{output_score.instruction}\n\nThe rating should be either 'pass' or 'fail'."
                        )
                case TaskOutputRatingType.pass_fail_critical:
                    if allow_float_scores:
                        property["type"] = "number"
                        property["minimum"] = -1
                        property["maximum"] = 1
                        property["description"] = (
                            f"{output_score.instruction}\n\nThe rating should be between -1 and 1, with 1 being a pass, 0 being a failure, and -1 being a critical failure (very severe failure)."
                        )
                    else:
                        property["enum"] = ["pass", "fail", "critical"]
                        property["type"] = "string"
                        property["description"] = (
                            f"{output_score.instruction}\n\nThe rating should be either 'pass', 'fail', or 'critical', where critical is a very severe failure."
                        )
                case TaskOutputRatingType.custom:
                    # Skip custom rating types in evals
                    continue
                case _:
                    raise_exhaustive_enum_error(output_score.type)

            properties[output_score_json_key] = property

        schema = {
            "type": "object",
            "properties": properties,
            "required": list(properties.keys()),
        }
        return json.dumps(schema, ensure_ascii=False)

class BaseEval:

Base class for all evals/evaluators.

Should be subclassed, and the run_eval method implemented.
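
A minimal subclass sketch (illustrative only): the class name and the "overall_rating" key below are placeholders, and in practice the score keys must match the json_key() of each score defined in the parent Eval's output_scores.

from typing import Dict

from kiln_ai.adapters.eval.base_eval import BaseEval
from kiln_ai.datamodel.eval import EvalScores
from kiln_ai.datamodel.task import TaskRun


class AlwaysPassEval(BaseEval):
    """Toy evaluator that gives every run a passing score (sketch only)."""

    async def run_eval(
        self, task_run: TaskRun
    ) -> tuple[EvalScores, Dict[str, str] | None]:
        # Keys must match the json_key() of each score in self.eval.output_scores;
        # "overall_rating" is a placeholder used purely for illustration.
        scores: EvalScores = {"overall_rating": 1.0}
        intermediate_outputs = {"thinking": "Trivial evaluator: always passes."}
        return scores, intermediate_outputs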

BaseEval(eval_config: kiln_ai.datamodel.eval.EvalConfig, run_config: kiln_ai.datamodel.run_config.RunConfigProperties | None)
eval_config
eval
target_task
score_schema
run_config
def model_and_provider(self) -> tuple[str, kiln_ai.datamodel.datamodel_enums.ModelProviderName]:
async def run_task_and_eval(self, input: str) -> tuple[kiln_ai.datamodel.TaskRun, typing.Dict[str, float], typing.Optional[typing.Dict[str, str]]]:

Runs the task on the provided run_config to generate fresh output, then runs the eval on that output.
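
A usage sketch, assuming an evaluator instance has already been constructed from a real EvalConfig and RunConfigProperties (the evaluator variable and the example input are placeholders, not part of this module):

import asyncio

# "evaluator" is assumed to be an already-constructed BaseEval subclass, e.g. the
# AlwaysPassEval sketch above, built with a real EvalConfig and RunConfigProperties.


async def main() -> None:
    # For tasks with an input_json_schema, pass the input as a JSON string.
    task_run, scores, intermediate = await evaluator.run_task_and_eval(
        '{"question": "What is 2 + 2?"}'
    )
    print(scores)        # e.g. {"overall_rating": 1.0}
    print(intermediate)  # dict of intermediate outputs (eval thinking), or None


asyncio.run(main())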

@abstractmethod
async def run_eval(self, task_run: kiln_ai.datamodel.TaskRun) -> tuple[typing.Dict[str, float], typing.Optional[typing.Dict[str, str]]]:

Runs the eval on the given task run.

Returns a dictionary of scores which should conform to the score schema, and a dictionary of intermediate outputs (eval thinking).
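
For illustration, the return value of a run_eval implementation might look like the sketch below. The "helpfulness" and "safety" keys are hypothetical; real keys come from the parent Eval's output_scores.

from kiln_ai.datamodel.eval import EvalScores

# Hypothetical score keys; run_task_and_eval validates the returned dict against
# self.score_schema, which is built with allow_float_scores=True, so floats are accepted.
scores: EvalScores = {
    "helpfulness": 3.75,  # five_star: number in [1, 5]
    "safety": 1.0,        # pass_fail: number in [0, 1]
}
intermediate_outputs = {"thinking": "The answer is correct and safe; the tone could be friendlier."}
result = (scores, intermediate_outputs)  # the tuple a run_eval implementation returns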

@classmethod
def build_score_schema(cls, eval: kiln_ai.datamodel.eval.Eval, allow_float_scores: bool = False) -> str:

Build a JSON schema for the scoring output of the task requirements.

We allow 2 modes: allow_float_scores=True and allow_float_scores=False.

allow_float_scores=False is used for the call to the model, and forces the model to select from discrete rating options (int 1-5, pass-fail, etc.). allow_float_scores=True is used for the final score output (for example, after we take a g-eval weighting of the model's logprobs). A pass/fail rating might return 0.75 for a likely pass (as opposed to 0.99 for a near-certain pass), or a 1-5 score might return 3.75.
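
As an illustration of the schema this produces, consider a hypothetical Eval (my_eval) with a single pass_fail score named "Safety", assuming its json_key() resolves to "safety". With allow_float_scores=False the result would look roughly like:

import json

from kiln_ai.adapters.eval.base_eval import BaseEval

# my_eval is a hypothetical kiln_ai.datamodel.eval.Eval with one pass_fail score named "Safety"
schema_json = BaseEval.build_score_schema(my_eval, allow_float_scores=False)
print(json.dumps(json.loads(schema_json), indent=2))
# {
#   "type": "object",
#   "properties": {
#     "safety": {
#       "title": "Safety",
#       "enum": ["pass", "fail"],
#       "type": "string",
#       "description": "<the score's instruction>\n\nThe rating should be either 'pass' or 'fail'."
#     }
#   },
#   "required": ["safety"]
# }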