kiln_ai.adapters.eval.base_eval

import json
from abc import abstractmethod
from typing import Dict

import jsonschema

from kiln_ai.adapters.adapter_registry import adapter_for_task
from kiln_ai.adapters.ml_model_list import ModelProviderName
from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig
from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalScores
from kiln_ai.datamodel.json_schema import validate_schema_with_value_error
from kiln_ai.datamodel.task import RunConfig, TaskOutputRatingType, TaskRun
from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error


class BaseEval:
    """
    Base class for all evals/evaluators.

    Should be subclassed, and the run_eval method implemented.
    """

    def __init__(self, eval_config: EvalConfig, run_config: RunConfig | None):
        self.eval_config = eval_config
        eval = eval_config.parent_eval()
        if not eval:
            raise ValueError("Eval config must have a parent eval")
        self.eval = eval
        task = self.eval.parent_task()
        if not task:
            raise ValueError("Eval must have a parent task")
        self.target_task = task
        self.score_schema = BaseEval.build_score_schema(eval, allow_float_scores=True)
        self.run_config = run_config

    def model_and_provider(self) -> tuple[str, ModelProviderName]:
        model_name = self.eval_config.model_name
        provider = self.eval_config.model_provider
        if (
            not model_name
            or not provider
            or not isinstance(model_name, str)
            or not isinstance(provider, str)
            or provider not in ModelProviderName.__members__
        ):
            raise ValueError(
                "Model name and provider must be set in the eval config model properties"
            )

        return model_name, ModelProviderName(provider)

    async def run_task_and_eval(
        self, input: str
    ) -> tuple[TaskRun, EvalScores, Dict[str, str] | None]:
        """
        Runs the task on the provided run_config to generate fresh output, then runs the eval on that output.
        """
        if self.run_config is None:
            raise ValueError("Run config is required for run_task_and_eval")

        run_adapter = adapter_for_task(
            self.target_task,
            self.run_config.model_name,
            ModelProviderName(self.run_config.model_provider_name),
            base_adapter_config=AdapterConfig(allow_saving=False),
        )

        # Parse structured input if needed
        parsed_input = input
        if self.target_task.output_json_schema is not None:
            parsed_input = json.loads(input)

        # We don't save by default here. We'll save manually after validating the output.
        run_output = await run_adapter.invoke(parsed_input)

        eval_output, intermediate_outputs = await self.run_eval(run_output)

        validate_schema_with_value_error(
            eval_output, self.score_schema, "Eval output does not match score schema."
        )

        return run_output, eval_output, intermediate_outputs

    @abstractmethod
    async def run_eval(
        self, task_run: TaskRun
    ) -> tuple[EvalScores, Dict[str, str] | None]:
        """
        Runs the eval on the given task run.

        Returns a dictionary of scores which should conform to the score schema, and a dictionary of intermediate outputs (eval thinking).
        """
        pass

    @classmethod
    def build_score_schema(cls, eval: Eval, allow_float_scores: bool = False) -> str:
        """
        Build a JSON schema for the scoring output of the task requirements.

        We allow 2 modes: allow_float_scores=True and allow_float_scores=False.

        allow_float_scores=False is used for the call to the model, and forces the model to select from discrete rating options (int 1-5, pass/fail, etc.).
        allow_float_scores=True is used for the final score output (for example, after we take a g-eval weighting of the model's logprobs). A pass/fail rating might return 0.75 for likely pass (as opposed to 0.99 for near-certain pass), or a 1-5 score might return 3.75.
        """

        # Note: Python dicts preserve insertion order, which is what we want: the user-defined order, with the overall score last.
        properties = {}
        for output_score in eval.output_scores:
            output_score_json_key = output_score.json_key()

            if len(output_score_json_key) == 0:
                raise ValueError(
                    f"Invalid output score name: {output_score.name}. Cannot be used as a JSON schema key."
                )
            property: dict[str, str | int | float | list[str] | list[int]] = {
                "title": output_score.name,
            }
            match output_score.type:
                case TaskOutputRatingType.five_star:
                    if allow_float_scores:
                        property["type"] = "number"
                        property["minimum"] = 1
                        property["maximum"] = 5
                    else:
                        property["enum"] = [1, 2, 3, 4, 5]

                    property["description"] = (
                        f"{output_score.instruction}\n\nThe rating should be between 1 and 5, with 1 being the worst and 5 being the best."
                    )
                case TaskOutputRatingType.pass_fail:
                    if allow_float_scores:
                        property["type"] = "number"
                        property["minimum"] = 0
                        property["maximum"] = 1
                        property["description"] = (
                            f"{output_score.instruction}\n\nThe rating should be between 0 and 1, with 0 being a failure and 1 being a pass."
                        )
                    else:
                        property["enum"] = ["pass", "fail"]
                        property["description"] = (
                            f"{output_score.instruction}\n\nThe rating should be either 'pass' or 'fail'."
                        )
                case TaskOutputRatingType.pass_fail_critical:
                    if allow_float_scores:
                        property["type"] = "number"
                        property["minimum"] = -1
                        property["maximum"] = 1
                        property["description"] = (
                            f"{output_score.instruction}\n\nThe rating should be between -1 and 1, with 1 being a pass, 0 being a failure, and -1 being a critical failure (very severe failure)."
                        )
                    else:
                        property["enum"] = ["pass", "fail", "critical"]
                        property["description"] = (
                            f"{output_score.instruction}\n\nThe rating should be either 'pass', 'fail', or 'critical', where critical is a very severe failure."
                        )
                case TaskOutputRatingType.custom:
                    # Skip custom rating types in evals
                    continue
                case _:
                    raise_exhaustive_enum_error(output_score.type)

            properties[output_score_json_key] = property

        schema = {
            "type": "object",
            "properties": properties,
            "required": list(properties.keys()),
        }
        return json.dumps(schema, ensure_ascii=False)
class BaseEval:

Base class for all evals/evaluators.

Should be subclassed, and the run_eval method implemented.
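
For illustration, a minimal sketch of a subclass. The class name, the "overall_rating" score key, and the length heuristic are all hypothetical, and it assumes task_run.output.output holds the model's raw output string; a real evaluator would typically call a judge model instead.

from typing import Dict

from kiln_ai.adapters.eval.base_eval import BaseEval
from kiln_ai.datamodel import TaskRun
from kiln_ai.datamodel.eval import EvalScores


class LengthHeuristicEval(BaseEval):
    """Toy evaluator (illustrative only): rates outputs purely on their length."""

    async def run_eval(
        self, task_run: TaskRun
    ) -> tuple[EvalScores, Dict[str, str] | None]:
        # Assumes the parent Eval defines a single five_star output score
        # whose json_key() is "overall_rating" (hypothetical).
        length = len(task_run.output.output)
        score = min(5.0, max(1.0, length / 100.0))
        scores: EvalScores = {"overall_rating": score}
        intermediate = {"reasoning": f"Output length was {length} characters."}
        return scores, intermediate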

BaseEval(eval_config: kiln_ai.datamodel.eval.EvalConfig, run_config: kiln_ai.datamodel.task.RunConfig | None)
eval_config
eval
target_task
score_schema
run_config
def model_and_provider(self) -> tuple[str, kiln_ai.adapters.ml_model_list.ModelProviderName]:
async def run_task_and_eval(self, input: str) -> tuple[kiln_ai.datamodel.TaskRun, typing.Dict[str, float], typing.Optional[typing.Dict[str, str]]]:

Runs the task on the provided run_config to generate fresh output, then runs the eval on that output.
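
For example, a hedged sketch of driving this from async code, assuming an EvalConfig (with a parent Eval and task) and a RunConfig for the model under test are already loaded from a project; LengthHeuristicEval is the illustrative subclass sketched above.

import asyncio

from kiln_ai.datamodel.eval import EvalConfig
from kiln_ai.datamodel.task import RunConfig


async def evaluate_one(eval_config: EvalConfig, run_config: RunConfig, sample_input: str):
    evaluator = LengthHeuristicEval(eval_config, run_config)
    task_run, scores, intermediate = await evaluator.run_task_and_eval(sample_input)
    # scores has already been validated against evaluator.score_schema;
    # the task run is not saved automatically, so persist it yourself if needed.
    print(scores, intermediate)
    return task_run


# asyncio.run(evaluate_one(my_eval_config, my_run_config, "some task input"))  # placeholder names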

@abstractmethod
async def run_eval(self, task_run: kiln_ai.datamodel.TaskRun) -> tuple[typing.Dict[str, float], typing.Optional[typing.Dict[str, str]]]:

Runs the eval on the given task run.

Returns a dictionary of scores which should conform to the score schema, and a dictionary of intermediate outputs (eval thinking).
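
As a concrete, hypothetical illustration of that return shape, for an eval whose output scores are a five_star "Helpfulness" and a pass_fail "Safety" (keys come from each output score's json_key()):

# Hypothetical run_eval return value; floats are allowed because the final
# score schema is built with allow_float_scores=True.
scores = {"helpfulness": 4.25, "safety": 1.0}
intermediate_outputs = {"chain_of_thought": "The answer addresses the question and cites its sources..."}
result = (scores, intermediate_outputs)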

@classmethod
def build_score_schema(cls, eval: kiln_ai.datamodel.eval.Eval, allow_float_scores: bool = False) -> str:

Build a JSON schema for the scoring output of the task requirements.

We allow 2 modes: allow_float_scores=True and allow_float_scores=False.

allow_float_scores=False is used for the call to the model, and forces the model to select from discrete rating options (int 1-5, pass/fail, etc.). allow_float_scores=True is used for the final score output (for example, after we take a g-eval weighting of the model's logprobs). A pass/fail rating might return 0.75 for likely pass (as opposed to 0.99 for near-certain pass), or a 1-5 score might return 3.75.
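
For instance, for a hypothetical eval with a single five_star score named "Helpfulness" (json key "helpfulness"), the two modes produce schemas along these lines (instruction text and descriptions abbreviated):

# allow_float_scores=False: discrete options only, used when prompting the judge model.
discrete_schema = {
    "type": "object",
    "properties": {
        "helpfulness": {
            "title": "Helpfulness",
            "enum": [1, 2, 3, 4, 5],
            "description": "Rate how helpful the answer is.\n\nThe rating should be between 1 and 5, ...",
        }
    },
    "required": ["helpfulness"],
}

# allow_float_scores=True: a continuous range, used for the final aggregated score.
float_schema = {
    "type": "object",
    "properties": {
        "helpfulness": {
            "title": "Helpfulness",
            "type": "number",
            "minimum": 1,
            "maximum": 5,
            "description": "Rate how helpful the answer is.\n\nThe rating should be between 1 and 5, ...",
        }
    },
    "required": ["helpfulness"],
}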