kiln_ai.adapters.eval.base_eval

import json
from abc import abstractmethod
from typing import Dict

from kiln_ai.adapters.adapter_registry import adapter_for_task
from kiln_ai.adapters.ml_model_list import ModelProviderName
from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig
from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalScores
from kiln_ai.datamodel.json_schema import validate_schema
from kiln_ai.datamodel.task import RunConfig, TaskOutputRatingType, TaskRun
from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error


class BaseEval:
    """
    Base class for all evals/evaluators.

    Should be subclassed, and the run_eval method implemented.
    """

    def __init__(self, eval_config: EvalConfig, run_config: RunConfig | None):
        self.eval_config = eval_config
        eval = eval_config.parent_eval()
        if not eval:
            raise ValueError("Eval config must have a parent eval")
        self.eval = eval
        task = self.eval.parent_task()
        if not task:
            raise ValueError("Eval must have a parent task")
        self.target_task = task
        self.score_schema = BaseEval.build_score_schema(eval, allow_float_scores=True)
        self.run_config = run_config

    def model_and_provider(self) -> tuple[str, ModelProviderName]:
        model_name = self.eval_config.model_name
        provider = self.eval_config.model_provider
        if (
            not model_name
            or not provider
            or not isinstance(model_name, str)
            or not isinstance(provider, str)
            or provider not in ModelProviderName.__members__
        ):
            raise ValueError(
                "Model name and provider must be set in the eval config model properties"
            )

        return model_name, ModelProviderName(provider)

    async def run_task_and_eval(
        self, input: str
    ) -> tuple[TaskRun, EvalScores, Dict[str, str] | None]:
        """
        Runs the task on the provided run_config to generate fresh output, then runs the eval on that output.
        """
        if self.run_config is None:
            raise ValueError("Run config is required for run_task_and_eval")

        run_adapter = adapter_for_task(
            self.target_task,
            self.run_config.model_name,
            ModelProviderName(self.run_config.model_provider_name),
            base_adapter_config=AdapterConfig(allow_saving=False),
        )

        # Parse structured input if needed
        parsed_input = input
        if self.target_task.output_json_schema is not None:
            parsed_input = json.loads(input)

        # We don't save by default here. We'll save manually after validating the output.
        run_output = await run_adapter.invoke(parsed_input)

        eval_output, intermediate_outputs = await self.run_eval(run_output)
        validate_schema(eval_output, self.score_schema)

        return run_output, eval_output, intermediate_outputs

    @abstractmethod
    async def run_eval(
        self, task_run: TaskRun
    ) -> tuple[EvalScores, Dict[str, str] | None]:
        """
        Runs the eval on the given task run.

        Returns a dictionary of scores which should conform to the score schema, and a dictionary of intermediate outputs (eval thinking).
        """
        pass

    @classmethod
    def build_score_schema(cls, eval: Eval, allow_float_scores: bool = False) -> str:
        """
        Build a JSON schema for the scoring output of the task requirements.

        We allow 2 modes: allow_float_scores=True and allow_float_scores=False.

        allow_float_scores=False is used for the call to the model, and forces the model to select from discrete rating options (int 1-5, pass-fail, etc.).
        allow_float_scores=True is used for final score output (for example, after we take a g-eval weighting of the model's logprobs). A pass/fail rating might return 0.75 for a likely pass (as opposed to 0.99 for a near-certain pass), or a 1-5 score might return 3.75.
        """

        # Note: Python dicts maintain insertion order, which is what we want here: user-defined order, with the overall score last
        properties = {}
        for output_score in eval.output_scores:
            output_score_json_key = output_score.json_key()

            if len(output_score_json_key) == 0:
                raise ValueError(
                    f"Invalid output score name: {output_score.name}. Cannot be used as a JSON schema key."
                )
            property: dict[str, str | int | float | list[str] | list[int]] = {
                "title": output_score.name,
            }
            match output_score.type:
                case TaskOutputRatingType.five_star:
                    if allow_float_scores:
                        property["type"] = "number"
                        property["minimum"] = 1
                        property["maximum"] = 5
                    else:
                        property["enum"] = [1, 2, 3, 4, 5]

                    property["description"] = (
                        f"{output_score.instruction}\n\nThe rating should be between 1 and 5, with 1 being the worst and 5 being the best."
                    )
                case TaskOutputRatingType.pass_fail:
                    if allow_float_scores:
                        property["type"] = "number"
                        property["minimum"] = 0
                        property["maximum"] = 1
                        property["description"] = (
                            f"{output_score.instruction}\n\nThe rating should be between 0 and 1, with 0 being a failure and 1 being a pass."
                        )
                    else:
                        property["enum"] = ["pass", "fail"]
                        property["description"] = (
                            f"{output_score.instruction}\n\nThe rating should be either 'pass' or 'fail'."
                        )
                case TaskOutputRatingType.pass_fail_critical:
                    if allow_float_scores:
                        property["type"] = "number"
                        property["minimum"] = -1
                        property["maximum"] = 1
                        property["description"] = (
                            f"{output_score.instruction}\n\nThe rating should be between -1 and 1, with 1 being a pass, 0 being a failure, and -1 being a critical failure (very severe failure)."
                        )
                    else:
                        property["enum"] = ["pass", "fail", "critical"]
                        property["description"] = (
                            f"{output_score.instruction}\n\nThe rating should be either 'pass', 'fail', or 'critical', where critical is a very severe failure."
                        )
                case TaskOutputRatingType.custom:
                    # Skip custom rating types in evals
                    continue
                case _:
                    raise_exhaustive_enum_error(output_score.type)

            properties[output_score_json_key] = property

        schema = {
            "type": "object",
            "properties": properties,
            "required": list(properties.keys()),
        }
        return json.dumps(schema, ensure_ascii=False)
class BaseEval:

Base class for all evals/evaluators.

Should be subclassed, and the run_eval method implemented.
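
A minimal sketch of a subclass, assuming the parent Eval defines a single output score whose json_key() is "overall_rating"; LengthEval, the scoring rule, and the task_run.output.output attribute access are illustrative assumptions, not part of this module:

from typing import Dict

from kiln_ai.adapters.eval.base_eval import BaseEval
from kiln_ai.datamodel.eval import EvalScores
from kiln_ai.datamodel.task import TaskRun


class LengthEval(BaseEval):
    """Toy evaluator that scores outputs purely on length (illustrative only)."""

    async def run_eval(
        self, task_run: TaskRun
    ) -> tuple[EvalScores, Dict[str, str] | None]:
        # Keys must be the json_key() of each output score configured on the parent Eval,
        # and the resulting dict must validate against self.score_schema.
        output_text = task_run.output.output  # assumption: the raw output string lives here
        scores: EvalScores = {"overall_rating": 5.0 if len(output_text) < 500 else 2.0}
        intermediate_outputs = {"reasoning": "Shorter outputs are preferred in this toy example."}
        return scores, intermediate_outputs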

BaseEval(eval_config: kiln_ai.datamodel.eval.EvalConfig, run_config: kiln_ai.datamodel.task.RunConfig | None)
eval_config
eval
target_task
score_schema
run_config
def model_and_provider(self) -> tuple[str, kiln_ai.adapters.ml_model_list.ModelProviderName]:
async def run_task_and_eval(self, input: str) -> tuple[kiln_ai.datamodel.TaskRun, typing.Dict[str, float], typing.Optional[typing.Dict[str, str]]]:

Runs the task on the provided run_config to generate fresh output, then runs the eval on that output.
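
A usage sketch, assuming evaluator is an already-constructed instance of a concrete BaseEval subclass (built from an EvalConfig and a RunConfig); evaluate_one and my_evaluator are placeholder names:

import asyncio
import json

from kiln_ai.adapters.eval.base_eval import BaseEval


async def evaluate_one(evaluator: BaseEval, input_payload: dict):
    # The input is passed as a string; structured inputs go in as a JSON string
    # and are parsed internally when the task declares a JSON schema.
    task_run, scores, intermediate = await evaluator.run_task_and_eval(json.dumps(input_payload))
    return task_run, scores, intermediate


# task_run, scores, intermediate = asyncio.run(
#     evaluate_one(my_evaluator, {"question": "What is 2 + 2?"})
# )
# scores might look like {"overall_rating": 3.75}; intermediate holds the eval's "thinking", or None.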

@abstractmethod
async def run_eval(self, task_run: kiln_ai.datamodel.TaskRun) -> tuple[typing.Dict[str, float], typing.Optional[typing.Dict[str, str]]]:

Runs the eval on the given task run.

Returns a dictionary of scores which should conform to the score schema, and a dictionary of intermediate outputs (eval thinking).
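
For example, for a hypothetical Eval configured with a five-star "Accuracy" score and a pass/fail "Overall Rating" score (assuming json_key() yields "accuracy" and "overall_rating"), a conforming return value would look roughly like this:

from typing import Dict

from kiln_ai.datamodel.eval import EvalScores

scores: EvalScores = {
    "accuracy": 4.25,       # five_star scores are floats in [1, 5] when allow_float_scores=True
    "overall_rating": 0.9,  # pass_fail scores are floats in [0, 1]
}
intermediate_outputs: Dict[str, str] = {
    "chain_of_thought": "The answer matches the reference on the key points...",  # illustrative key
}
# run_eval returns (scores, intermediate_outputs); the second element may be None.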

@classmethod
def build_score_schema(cls, eval: kiln_ai.datamodel.eval.Eval, allow_float_scores: bool = False) -> str:

Build a JSON schema for the scoring output of the task requirements

We allow 2 modes: allow_float_scores=True and allow_float_scores=False.

allow_float_scores=False is used for the call to the model, and forces the model to select from discrete rating options (int 1-5, pass-fail, etc.). allow_float_scores=True is used for final score output (for example, after we take a g-eval weighting of the model's logprobs). A pass/fail rating might return 0.75 for a likely pass (as opposed to 0.99 for a near-certain pass), or a 1-5 score might return 3.75.
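
As an illustration, for an Eval whose only output score is a pass/fail score named "Overall Rating" (assuming json_key() maps it to "overall_rating"), the two modes produce schemas roughly like this; my_eval is a placeholder for a kiln_ai.datamodel.eval.Eval instance:

from kiln_ai.adapters.eval.base_eval import BaseEval

# Schema sent to the model: discrete options only.
model_schema = BaseEval.build_score_schema(my_eval, allow_float_scores=False)
# {"type": "object",
#  "properties": {"overall_rating": {
#      "title": "Overall Rating",
#      "enum": ["pass", "fail"],
#      "description": "<instruction>\n\nThe rating should be either 'pass' or 'fail'."}},
#  "required": ["overall_rating"]}
# (<instruction> stands for the score's instruction text.)

# Schema for final scores: a continuous range instead of an enum.
final_schema = BaseEval.build_score_schema(my_eval, allow_float_scores=True)
# Same overall shape, but the "overall_rating" property becomes:
# {"title": "Overall Rating", "type": "number", "minimum": 0, "maximum": 1,
#  "description": "<instruction>\n\nThe rating should be between 0 and 1, with 0 being a failure and 1 being a pass."}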