kiln_ai.adapters.eval.g_eval

import math
from typing import Dict, List, Tuple

from litellm.types.utils import ChatCompletionTokenLogprob

from kiln_ai.adapters.adapter_registry import adapter_for_task
from kiln_ai.adapters.eval.base_eval import BaseEval
from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig, RunOutput
from kiln_ai.adapters.prompt_builders import PromptGenerators
from kiln_ai.datamodel import Project, Task, TaskRun
from kiln_ai.datamodel.eval import EvalConfig, EvalConfigType, EvalScores
from kiln_ai.datamodel.task import RunConfig

# all the tokens we score for, and their float scores.
TOKEN_TO_SCORE_MAP: Dict[str, float] = {
    "1": 1.0,
    "2": 2.0,
    "3": 3.0,
    "4": 4.0,
    "5": 5.0,
    "pass": 1.0,
    "fail": 0.0,
    "critical": -1.0,
}


class GEvalTask(Task, parent_of={}):
    """
    Kiln task for executing a G-Eval. Can be run on any Kiln adapter which supports logprobs.

    Note: GEval implements both G-Eval and LLM as Judge, as they are very similar.
    """

    def __init__(self, eval_config: EvalConfig):
        tmp_project = Project(name="GEval")

        # Build a simple LLM as Judge system instruction
        system_instruction = "Your job is to evaluate a model's performance on a task. Blocks will be marked with <eval_data> tags.\n"
        # Optionally add a short task description
        task_description = eval_config.properties.get("task_description", None)
        if task_description:
            system_instruction += f"\nThe task the model was given is as follows:\n<eval_data>\n{task_description}\n</eval_data>\n"

        # Build the COT eval instructions
        cot_instructions = "First, think step by step about the model's performance following these evaluation steps:\n\n"
        steps = eval_config.properties.get("eval_steps", None)
        if not steps or not isinstance(steps, list):
            raise ValueError("eval_steps must be a list")
        for i, step in enumerate(steps):
            cot_instructions += f"{i + 1}) {step}\n"

        eval = eval_config.parent_eval()
        if not eval:
            raise ValueError("Eval config must have a parent eval")

        # Build the output schema from the eval's target output scores.
        # We restrict the LLM's output scoring schema to discrete scores (pass/fail/critical/1-5) - allow_float_scores=False
        # However, the final scores from the evaluator can still be floats: the logprob calculation below requires discrete token outputs but produces fractional scores
        output_schema = BaseEval.build_score_schema(eval, allow_float_scores=False)

        super().__init__(
            name="GEval Task",
            parent=tmp_project,
            instruction=system_instruction,
            thinking_instruction=cot_instructions,
            output_json_schema=output_schema,
        )


class GEval(BaseEval):
    """
    An evaluator which implements both G-Eval and LLM as Judge.

    G-Eval is a method of evaluating the quality of a model's output. The score is a weighted average of the candidate rating tokens, weighted by the probabilities derived from their log probabilities. https://arxiv.org/abs/2303.16634

    LLM as Judge is a method of evaluating the quality of a model's output. It simply asks the LLM to score, and uses the returned output directly (no logprobs needed). Also called direct evaluation.

    @misc{liu2023gevalnlgevaluationusing,
        title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
        author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
        year={2023},
        eprint={2303.16634},
        archivePrefix={arXiv},
        primaryClass={cs.CL},
        url={https://arxiv.org/abs/2303.16634},
    }
    """

    def __init__(self, eval_config: EvalConfig, run_config: RunConfig | None):
        if (
            eval_config.config_type != EvalConfigType.g_eval
            and eval_config.config_type != EvalConfigType.llm_as_judge
        ):
            raise ValueError(
                f"GEval must be initialized with a GEval or LLM as Judge config_type. Got {eval_config.config_type}"
            )

        super().__init__(eval_config, run_config)

        self.geval_task = GEvalTask(eval_config)

    async def run_eval(
        self, task_run: TaskRun
    ) -> tuple[EvalScores, Dict[str, str] | None]:
        """
        Run this eval on the given task run.
        """

        model_name, provider = self.model_and_provider()

        # Only fetch logprobs for G-Eval
        # There are at most 5 valid rating tokens per rating type (five_star being the largest), so 10 top logprobs is more than enough to capture even very unlikely rating tokens
        top_logprobs = (
            10 if self.eval_config.config_type == EvalConfigType.g_eval else None
        )

        adapter = adapter_for_task(
            self.geval_task,
            model_name,
            provider,
            # We always use Simple COT for G-Eval and LLM as Judge
            prompt_id=PromptGenerators.SIMPLE_CHAIN_OF_THOUGHT,
            base_adapter_config=AdapterConfig(
                # Don't save this run into the task_runs. It will be saved into an eval_run where it belongs
                allow_saving=False,
                top_logprobs=top_logprobs,
            ),
        )

        input = f"""The model was given the following input for the task:
<eval_data>
{task_run.input}
</eval_data>

The model produced the following output for the task:
<eval_data>
{task_run.output}
</eval_data>
"""

        # We don't need the run itself, but invoke_returning_run_output() runs validations for us (unlike calling _run() directly)
        _, run_output = await adapter.invoke_returning_run_output(input)

        if self.eval_config.config_type == EvalConfigType.llm_as_judge:
            return self.build_llm_as_judge_score(
                run_output
            ), run_output.intermediate_outputs
        else:
            return self.build_g_eval_score(run_output), run_output.intermediate_outputs

    def build_llm_as_judge_score(self, run_output: RunOutput) -> EvalScores:
        """
        Build the LLM as Judge score for the given run and run output.
        """
        # Convert the output format we asked for (discrete values) to our float scores
        scores: EvalScores = {}
        if not isinstance(run_output.output, dict):
            raise ValueError("LLM as Judge output must be a dictionary")

        for metric, score in run_output.output.items():
            token_score = self.score_from_token_string(f"{score}")
            if token_score is None:
                raise ValueError(
                    f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema."
                )
            scores[metric] = token_score
        return scores

    def build_g_eval_score(self, run_output: RunOutput) -> EvalScores:
        """
        Build the G-Eval score for the given run and run output.

        We create a weighted average of each rating using the logprobs.

        @misc{liu2023gevalnlgevaluationusing,
            title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
            author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
            year={2023},
            eprint={2303.16634},
            archivePrefix={arXiv},
            primaryClass={cs.CL},
            url={https://arxiv.org/abs/2303.16634},
        }
        """
        # We use structured output
        outputs = run_output.output
        assert isinstance(outputs, dict)

        # Build raw string output from the logprobs, which is easier to work with than Dict for the next bit
        raw_output = self.raw_output_from_logprobs(run_output)

        # find the offset of the start of each metric in the raw output json
        metrics: List[str] = list(outputs.keys())
        metric_offsets = self.metric_offsets(raw_output, metrics)

        final_scores: EvalScores = {}
        for metric in metrics:
            score = self.g_eval_single_metric(
                run_output, metric, metric_offsets, raw_output
            )
            if score is None:
                raise ValueError(
                    f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema."
                )
            final_scores[metric] = score

        return final_scores

    def g_eval_single_metric(
        self,
        run_output: RunOutput,
        metric: str,
        metric_offsets: Dict[str, int],
        raw_output: str,
    ) -> float | None:
        """
        Run the G-Eval for a single metric.

        Scan the logprobs for the metric and return the weighted score of the rating token.
        """

        start_offset, end_offset = self.token_search_range(
            raw_output, metric, metric_offsets
        )

        offset = 0

        if (
            run_output.output_logprobs is None
            or run_output.output_logprobs.content is None
        ):
            raise RuntimeError(
                "No logprobs found for output - can not calculate g-eval"
            )

        # scan the tokens in the range, looking for the rating token
        for chat_logprob in run_output.output_logprobs.content:
            if offset >= end_offset:
                break
            if offset >= start_offset:
                score = self.rating_token_to_score(chat_logprob)
                if score is not None:
                    return score
            offset += len(chat_logprob.token)

        return None

    def raw_output_from_logprobs(self, run_output: RunOutput) -> str:
        """
        Build the raw output string from the logprobs. Generate from logprobs so it's guaranteed to match the logprobs offsets
        """
        if (
            run_output.output_logprobs is None
            or run_output.output_logprobs.content is None
        ):
            raise RuntimeError(
                "No logprobs found for output - can not calculate g-eval"
            )

        raw = ""
        for chat_logprob in run_output.output_logprobs.content:
            raw += chat_logprob.token
        return raw

    def token_search_range(
        self, raw_output: str, metric: str, metric_offsets: Dict[str, int]
    ) -> Tuple[int, int]:
        """
        Find the start and end offsets of the metric in the raw output.

        Start searching after the end of the target metric json entry ("overall_rating":), and before the start of the next metric ("some_other_score").
        """
        start_offset = metric_offsets[metric] + len(metric)

        # Find the lowest metric offset that is greater than the start offset; it becomes the end offset
        end_offset = len(raw_output)
        for v in metric_offsets.values():
            if v < end_offset and v > start_offset:
                end_offset = v

        return start_offset, end_offset

    def rating_token_to_score(
        self, token_logprob: ChatCompletionTokenLogprob
    ) -> float | None:
        """
        Convert a rating token to a score using weighted average of top logprobs.

        Only includes tokens that have valid scores.

        Some cleanup for upper case, whitespace and quotes. LLMs aren't always consistent.
        """
        primary_token_score = self.score_from_token_string(token_logprob.token)
        # check this is a real rating token, it could just be the ": ", "," or whitespace
        # (compare with None: 0.0 is a valid score for "fail" and must not be skipped)
        if primary_token_score is None:
            return None

        total_score = 0.0
        total_probability = 0.0
        top_logprobs_contains_primary_token = False

        # Process all valid scoring tokens from alternatives
        for top_logprob in token_logprob.top_logprobs:
            if top_logprob.token == token_logprob.token:
                top_logprobs_contains_primary_token = True
            token_score = self.score_from_token_string(top_logprob.token)
            if token_score is not None:
                # Convert logprob to probability
                probability = math.exp(top_logprob.logprob)
                total_score += token_score * probability
                total_probability += probability

        # Weird OpenAI 4o bug - sometimes the primary token is included in the top logprobs, sometimes not.
        # Add the primary token back in if excluded
        if not top_logprobs_contains_primary_token:
            if token_logprob.logprob == -9999.0:
                # Another "bug" - sometimes the logprob is -9999.0. This seems to happen when the rest of the logprobs are tiny probability.
                total_score += primary_token_score * 1.0
                total_probability += 1.0
            else:
                probability = math.exp(token_logprob.logprob)
                total_score += primary_token_score * probability
                total_probability += probability

        if total_probability <= 0.0:
            raise RuntimeError(
                f"No valid scoring tokens found for {token_logprob.token}. This should never happen as the token has a valid score (so it must be excluded from top logprobs). Please file a bug if you see this."
            )

        # Normalize by the total probability of valid tokens (the LLM may have wanted to generate other non-rating tokens; these shouldn't lower the score of rating tokens)
        weighted_score = total_score / total_probability

        return weighted_score

    def score_from_token_string(self, token: str) -> float | None:
        if token in TOKEN_TO_SCORE_MAP:
            return TOKEN_TO_SCORE_MAP[token]

        # handle more token variations like '"1"' and '"pass"' and ' paSS' and 'PASS'
        unquoted_token = token.strip().strip('"').lower()
        if unquoted_token in TOKEN_TO_SCORE_MAP:
            return TOKEN_TO_SCORE_MAP[unquoted_token]

        # handle numeric tokens like "1.0"
        try:
            float_value = float(token)
            if float_value.is_integer():
                str_token = str(int(float_value))
                if str_token in TOKEN_TO_SCORE_MAP:
                    return TOKEN_TO_SCORE_MAP[str_token]
        except ValueError:
            pass

        return None

    def metric_offsets(self, raw_output: str, metrics: List[str]) -> Dict[str, int]:
        """
        Find the offset of the start of each metric in the raw output json.

        For the example json `{"overall_rating": 1}` it should return:
        {
            "overall_rating": 1  # the quoted metric name starts 1 character into the json string
        }
        """
        metric_offsets: Dict[str, int] = {}
        for metric in metrics:
            # the quoted metric name is expected in the json, e.g. "overall_rating" in {"overall_rating": 1}
            metric_name = f'"{metric}"'

            # we expect it exactly once
            count = raw_output.count(metric_name)
            if count != 1:
                raise ValueError(
                    f"Metric {metric} should appear exactly once in the output. Found {count} times"
                )

            offset = raw_output.find(metric_name)
            if offset == -1:
                raise ValueError(f"Metric {metric} not found in raw output")
            metric_offsets[metric] = offset
        return metric_offsets
TOKEN_TO_SCORE_MAP: Dict[str, float] = {'1': 1.0, '2': 2.0, '3': 3.0, '4': 4.0, '5': 5.0, 'pass': 1.0, 'fail': 0.0, 'critical': -1.0}
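
A small standalone sketch of how rating tokens map to float scores, mirroring the normalization that score_from_token_string applies (quotes, whitespace, case, and integer-valued floats). The sample tokens below are illustrative only.

from typing import Dict, Optional

TOKEN_TO_SCORE_MAP: Dict[str, float] = {
    "1": 1.0, "2": 2.0, "3": 3.0, "4": 4.0, "5": 5.0,
    "pass": 1.0, "fail": 0.0, "critical": -1.0,
}

def score_for(token: str) -> Optional[float]:
    # exact match first
    if token in TOKEN_TO_SCORE_MAP:
        return TOKEN_TO_SCORE_MAP[token]
    # strip whitespace/quotes and lowercase, e.g. ' "PASS"' -> 'pass'
    cleaned = token.strip().strip('"').lower()
    if cleaned in TOKEN_TO_SCORE_MAP:
        return TOKEN_TO_SCORE_MAP[cleaned]
    # integer-valued numerics, e.g. "4.0" -> "4"
    try:
        value = float(token)
        if value.is_integer():
            return TOKEN_TO_SCORE_MAP.get(str(int(value)))
    except ValueError:
        return None
    return None

assert score_for('"PASS"') == 1.0
assert score_for("4.0") == 4.0
assert score_for(",") is None
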
class GEvalTask(kiln_ai.datamodel.task.Task):

Kiln task for executing a G-Eval. Can be run on any Kiln adapter which supports logprobs.

Note: GEval implements both G-Eval and LLM as Judge, as they are very similar.
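
As a rough sketch of what the constructor builds (the eval steps below are made up for illustration; real ones come from eval_config.properties["eval_steps"]), the thinking instruction is just a numbered list derived from the eval config:

# Hypothetical eval steps, for illustration only
eval_steps = [
    "Check the output answers the question asked in the input.",
    "Check the output follows the task's formatting requirements.",
]

cot_instructions = (
    "First, think step by step about the model's performance "
    "following these evaluation steps:\n\n"
)
for i, step in enumerate(eval_steps):
    cot_instructions += f"{i + 1}) {step}\n"

print(cot_instructions)
# 1) Check the output answers the question asked in the input.
# 2) Check the output follows the task's formatting requirements.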

GEvalTask(eval_config: kiln_ai.datamodel.eval.EvalConfig)



class GEval(kiln_ai.adapters.eval.base_eval.BaseEval):

An evaluator which implements both G-Eval and LLM as Judge.

G-Eval is a method of evaluating the quality of a model's output. The score is a weighted average of the candidate rating tokens, weighted by the probabilities derived from their log probabilities. https://arxiv.org/abs/2303.16634

LLM as Judge is a method of evaluating the quality of a model's output. It simply asks the LLM to score, and uses the returned output directly (no logprobs needed). Also called direct evaluation.

@misc{liu2023gevalnlgevaluationusing, title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment}, author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu}, year={2023}, eprint={2303.16634}, archivePrefix={arXiv}, primaryClass={cs.CL}, url={https://arxiv.org/abs/2303.16634}, }
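
To make the difference concrete, here is a minimal sketch (with made-up logprob values) of how the two modes would score the same "4" rating token: LLM as Judge takes the emitted value at face value, while G-Eval averages over the candidate rating tokens weighted by their probabilities.

import math

# Hypothetical top logprobs at the rating token position (illustrative values only)
top_logprobs = {"4": -0.22, "5": -1.61, ",": -5.0}

# LLM as Judge: use the emitted token directly
llm_as_judge_score = 4.0

# G-Eval: probability-weighted average over valid rating tokens ("," is ignored)
token_scores = {"4": 4.0, "5": 5.0}
total = sum(math.exp(lp) for tok, lp in top_logprobs.items() if tok in token_scores)
g_eval_score = sum(
    token_scores[tok] * math.exp(lp)
    for tok, lp in top_logprobs.items()
    if tok in token_scores
) / total

print(round(llm_as_judge_score, 2), round(g_eval_score, 2))  # 4.0 vs roughly 4.2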

GEval( eval_config: kiln_ai.datamodel.eval.EvalConfig, run_config: kiln_ai.datamodel.task.RunConfig | None)
geval_task
async def run_eval( self, task_run: kiln_ai.datamodel.TaskRun) -> tuple[typing.Dict[str, float], typing.Optional[typing.Dict[str, str]]]:

Run this eval on the given task run.
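
A hedged usage sketch: assuming you already have an EvalConfig (of type g_eval or llm_as_judge), an optional RunConfig, and a TaskRun loaded from your Kiln project (obtaining those is outside this module), running the eval looks roughly like this. The my_* names are placeholders.

import asyncio

from kiln_ai.adapters.eval.g_eval import GEval

async def score_one(eval_config, run_config, task_run):
    evaluator = GEval(eval_config, run_config)
    # Returns the per-metric scores plus any intermediate outputs (e.g. the chain of thought)
    scores, intermediate_outputs = await evaluator.run_eval(task_run)
    return scores

# scores = asyncio.run(score_one(my_eval_config, my_run_config, my_task_run))
# e.g. {"overall_rating": 4.21}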

def build_llm_as_judge_score( self, run_output: kiln_ai.adapters.run_output.RunOutput) -> Dict[str, float]:

Build the LLM as Judge score for the given run and run output.
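
For instance, if the judge model returned the structured output below (metric names invented for illustration), each discrete value maps through TOKEN_TO_SCORE_MAP to a float. This is a standalone sketch; the real method reads run_output.output.

judge_output = {"overall_rating": "4", "meets_requirements": "pass"}

TOKEN_TO_SCORE_MAP = {"1": 1.0, "2": 2.0, "3": 3.0, "4": 4.0, "5": 5.0,
                      "pass": 1.0, "fail": 0.0, "critical": -1.0}

scores = {metric: TOKEN_TO_SCORE_MAP[f"{value}".strip().strip('"').lower()]
          for metric, value in judge_output.items()}

print(scores)  # {'overall_rating': 4.0, 'meets_requirements': 1.0}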

def build_g_eval_score( self, run_output: kiln_ai.adapters.run_output.RunOutput) -> Dict[str, float]:

Build the G-Eval score for the given run and run output.

We create a weighted average of each rating using the logprobs.

@misc{liu2023gevalnlgevaluationusing, title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment}, author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu}, year={2023}, eprint={2303.16634}, archivePrefix={arXiv}, primaryClass={cs.CL}, url={https://arxiv.org/abs/2303.16634}, }

def g_eval_single_metric( self, run_output: kiln_ai.adapters.run_output.RunOutput, metric: str, metric_offsets: Dict[str, int], raw_output: str) -> float | None:

Run the G-Eval for a single metric.

Scan the logprobs for the metric and return the weighted score of the rating token.
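
A simplified sketch of the scan, using a made-up token stream: walk the response tokens while tracking a running character offset, and return the score of the first token inside the metric's search range that maps to a rating.

# Hypothetical tokenization of '{"overall_rating": 4}' - real tokens come from run_output.output_logprobs
tokens = ['{"', 'overall', '_rating', '":', ' ', '4', '}']
start_offset, end_offset = 15, 21  # search window for overall_rating (see token_search_range)

RATING_SCORES = {"4": 4.0}  # stand-in for rating_token_to_score

offset = 0
score = None
for token in tokens:
    if offset >= end_offset:
        break
    if offset >= start_offset:
        score = RATING_SCORES.get(token.strip())
        if score is not None:
            break
    offset += len(token)

print(score)  # 4.0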

def raw_output_from_logprobs(self, run_output: kiln_ai.adapters.run_output.RunOutput) -> str:

Build the raw output string from the logprobs. Generate from logprobs so it's guaranteed to match the logprobs offsets
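
Equivalently (a sketch using stand-in objects rather than litellm's ChatCompletionTokenLogprob), the raw string is just the concatenation of the response tokens, so character offsets computed against it line up with the token stream:

# Hypothetical logprob content - real entries are ChatCompletionTokenLogprob objects
class FakeTokenLogprob:
    def __init__(self, token: str):
        self.token = token

content = [FakeTokenLogprob(t) for t in ['{"', 'overall', '_rating', '":', ' ', '4', '}']]

raw_output = "".join(entry.token for entry in content)
print(raw_output)  # {"overall_rating": 4}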

def token_search_range( self, raw_output: str, metric: str, metric_offsets: Dict[str, int]) -> Tuple[int, int]:

Find the start and end offsets of the metric in the raw output.

Start searching after the end of the target metric json entry ("overall_rating":), and before the start of the next metric ("some_other_score").
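
A worked example (metric names invented for illustration): with two metrics in the raw JSON, the search range for the first metric ends where the next metric's quoted name begins.

raw_output = '{"overall_rating": 4, "meets_requirements": "pass"}'

# offsets of each quoted metric name (see metric_offsets below)
metric_offsets = {
    "overall_rating": raw_output.find('"overall_rating"'),          # 1
    "meets_requirements": raw_output.find('"meets_requirements"'),  # 22
}

metric = "overall_rating"
start_offset = metric_offsets[metric] + len(metric)  # 1 + 14 = 15

# end at the smallest metric offset past the start, else the end of the string
end_offset = min(
    [v for v in metric_offsets.values() if v > start_offset],
    default=len(raw_output),
)

print(start_offset, end_offset)  # 15 22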

def rating_token_to_score( self, token_logprob: litellm.types.utils.ChatCompletionTokenLogprob) -> float | None:

Convert a rating token to a score using weighted average of top logprobs.

Only includes tokens that have valid scores.

Some cleanup for upper case, whitespace and quotes. LLMs aren't always consistent.
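
A numeric sketch of the weighted average (logprob values invented): only alternatives that map to valid scores contribute, and the result is normalized by their total probability so non-rating tokens in the top logprobs don't drag the score down.

import math

# Hypothetical top logprobs at the rating token position
top_logprobs = [("4", -0.36), ("5", -1.90), ("3", -2.70), ('":', -6.0)]
token_scores = {"3": 3.0, "4": 4.0, "5": 5.0}  # '":' has no score and is skipped

total_score = 0.0
total_probability = 0.0
for token, logprob in top_logprobs:
    score = token_scores.get(token)
    if score is not None:
        probability = math.exp(logprob)
        total_score += score * probability
        total_probability += probability

weighted_score = total_score / total_probability
print(round(weighted_score, 2))  # about 4.09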

def score_from_token_string(self, token: str) -> float | None:
def metric_offsets(self, raw_output: str, metrics: List[str]) -> Dict[str, int]:

Find the offset of the start of each metric in the raw output json.

For the example json `{"overall_rating": 1}` it should return {"overall_rating": 1}, since the quoted metric name starts 1 character into the json string.
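
A worked example (metric names invented): each quoted metric name must appear exactly once, and its offset is where that quoted name starts in the raw JSON.

raw_output = '{"overall_rating": 4, "meets_requirements": "pass"}'
metrics = ["overall_rating", "meets_requirements"]

metric_offsets = {}
for metric in metrics:
    metric_name = f'"{metric}"'
    assert raw_output.count(metric_name) == 1  # the method raises ValueError otherwise
    metric_offsets[metric] = raw_output.find(metric_name)

print(metric_offsets)  # {'overall_rating': 1, 'meets_requirements': 22}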