kiln_ai.adapters.eval.g_eval

import math
from typing import Dict, List, Tuple

from litellm.types.utils import ChatCompletionTokenLogprob

from kiln_ai.adapters.adapter_registry import adapter_for_task
from kiln_ai.adapters.eval.base_eval import BaseEval
from kiln_ai.adapters.ml_model_list import (
    default_structured_output_mode_for_model_provider,
)
from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig, RunOutput
from kiln_ai.adapters.prompt_builders import PromptGenerators
from kiln_ai.datamodel import Project, Task, TaskRun
from kiln_ai.datamodel.eval import EvalConfig, EvalConfigType, EvalScores
from kiln_ai.datamodel.task import RunConfigProperties, StructuredOutputMode

# all the tokens we score for, and their float scores.
TOKEN_TO_SCORE_MAP: Dict[str, float] = {
    "1": 1.0,
    "2": 2.0,
    "3": 3.0,
    "4": 4.0,
    "5": 5.0,
    "pass": 1.0,
    "fail": 0.0,
    "critical": -1.0,
}

class GEvalTask(Task, parent_of={}):
    """
    Kiln task for executing a G-Eval. Can be run on any Kiln adapter which supports logprobs.

    Note: G-Eval implements both G-Eval and LLM as Judge, as they are very similar.
    """

    def __init__(self, eval_config: EvalConfig):
        tmp_project = Project(name="GEval")

        # Build a simple LLM as Judge system instruction
        system_instruction = "Your job is to evaluate a model's performance on a task. Blocks will be marked with <eval_data> tags.\n"
        # Optionally add a short task description
        task_description = eval_config.properties.get("task_description", None)
        if task_description:
            system_instruction += f"\nThe task the model was given is as follows:\n<eval_data>\n{task_description}\n</eval_data>\n"

        # Build the COT eval instructions
        cot_instructions = "First, think step by step about the model's performance following these evaluation steps:\n\n"
        steps = eval_config.properties.get("eval_steps", [])
        if not isinstance(steps, list):
            raise ValueError("eval_steps must be a list.")
        for i, step in enumerate(steps):
            cot_instructions += f"{i + 1}) {step}\n"

        eval = eval_config.parent_eval()
        if not eval:
            raise ValueError("Eval config must have a parent eval")

        # Build the output schema from the eval's target output scores.
        # We restrict the LLM's output scoring schema to discrete scores (pass/fail/critical/1-5) - allow_float_scores=False
        # However, the final scores from the evaluator can be floats (see the later logprob calculation, which requires discrete token outputs)
        output_schema = BaseEval.build_score_schema(eval, allow_float_scores=False)

        super().__init__(
            name="GEval Task",
            parent=tmp_project,
            instruction=system_instruction,
            thinking_instruction=cot_instructions,
            output_json_schema=output_schema,
        )

class GEval(BaseEval):
    """
    An evaluator which implements G-Eval and LLM as Judge.

    G-Eval is a method of evaluating the quality of a model's output. The score is a weighted average of the candidate rating tokens' scores, weighted by their probabilities (derived from the log probabilities of the output tokens). https://arxiv.org/abs/2303.16634

    LLM as Judge is a method of evaluating the quality of a model's output. It simply asks the LLM to score, and uses the returned output (no logprobs needed). Also called direct evaluation.

    @misc{liu2023gevalnlgevaluationusing,
        title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
        author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
        year={2023},
        eprint={2303.16634},
        archivePrefix={arXiv},
        primaryClass={cs.CL},
        url={https://arxiv.org/abs/2303.16634},
    }
    """

    def __init__(self, eval_config: EvalConfig, run_config: RunConfigProperties | None):
        if (
            eval_config.config_type != EvalConfigType.g_eval
            and eval_config.config_type != EvalConfigType.llm_as_judge
        ):
            raise ValueError(
                f"GEval must be initialized with a GEval or LLM as Judge config_type. Got {eval_config.config_type}"
            )

        super().__init__(eval_config, run_config)

        self.geval_task = GEvalTask(eval_config)

    def generate_run_description(self, eval_input: str, eval_output: str) -> str:
        return f"""The model was given the following input for the task: 
<eval_data>
{eval_input}
</eval_data>

The model produced the following output for the task:
<eval_data>
{eval_output}
</eval_data>
"""

    async def run_eval(
        self, task_run: TaskRun
    ) -> tuple[EvalScores, Dict[str, str] | None]:
        """
        Run this eval on the given task run.
        """

        model_name, provider = self.model_and_provider()

        # Only fetch logprobs for G-Eval
        # There are at most 5 valid rating tokens per rating type (five_star being the largest), so 10 is more than enough to capture even very unlikely alternatives
        top_logprobs = (
            10 if self.eval_config.config_type == EvalConfigType.g_eval else None
        )

        # We don't expose setting this manually in the UI, so pull a recommended mode from ml_model_list
        structured_output_mode = default_structured_output_mode_for_model_provider(
            model_name,
            provider,
            default=StructuredOutputMode.json_schema,
            # G-eval expects JSON, so don't allow function calling modes
            disallowed_modes=[
                StructuredOutputMode.function_calling,
                StructuredOutputMode.function_calling_weak,
            ],
        )

        adapter = adapter_for_task(
            self.geval_task,
            run_config_properties=RunConfigProperties(
                model_name=model_name,
                model_provider_name=provider,
                # We always use Simple COT for G-Eval and LLM as Judge
                prompt_id=PromptGenerators.SIMPLE_CHAIN_OF_THOUGHT,
                structured_output_mode=structured_output_mode,
            ),
            base_adapter_config=AdapterConfig(
                # Don't save this run into the task_runs. It will be saved into an eval_run where it belongs
                allow_saving=False,
                top_logprobs=top_logprobs,
            ),
        )

        run_description = self.generate_run_description(
            task_run.input, task_run.output.output
        )

        # We don't need the run itself, but invoke_returning_run_output() runs validations for us (unlike calling _run() directly)
        _, run_output = await adapter.invoke_returning_run_output(run_description)

        if self.eval_config.config_type == EvalConfigType.llm_as_judge:
            return self.build_llm_as_judge_score(
                run_output
            ), run_output.intermediate_outputs
        else:
            return self.build_g_eval_score(run_output), run_output.intermediate_outputs

    def build_llm_as_judge_score(self, run_output: RunOutput) -> EvalScores:
        """
        Build the LLM as Judge score for the given run and run output.
        """
        # Convert the output format we asked for (discrete values) to our float scores
        scores: EvalScores = {}
        if not isinstance(run_output.output, dict):
            raise ValueError("LLM as Judge output must be a dictionary")

        for metric, score in run_output.output.items():
            token_score = self.score_from_token_string(f"{score}")
            if token_score is None:
                raise ValueError(
                    f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema."
                )
            scores[metric] = token_score
        return scores

    def build_g_eval_score(self, run_output: RunOutput) -> EvalScores:
        """
        Build the G-Eval score for the given run and run output.

        We create a weighted average of each rating using the logprobs.

        @misc{liu2023gevalnlgevaluationusing,
            title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
            author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
            year={2023},
            eprint={2303.16634},
            archivePrefix={arXiv},
            primaryClass={cs.CL},
            url={https://arxiv.org/abs/2303.16634},
        }
        """
        # We use structured output
        outputs = run_output.output
        assert isinstance(outputs, dict)

        # Build raw string output from the logprobs, which is easier to work with than Dict for the next bit
        raw_output = self.raw_output_from_logprobs(run_output)

        # find the offset of the start of each metric in the raw output json
        metrics: List[str] = list(outputs.keys())
        metric_offsets = self.metric_offsets(raw_output, metrics)

        final_scores: EvalScores = {}
        for metric in metrics:
            score = self.g_eval_single_metric(
                run_output, metric, metric_offsets, raw_output
            )
            if score is None:
                raise ValueError(
                    f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema."
                )
            final_scores[metric] = score

        return final_scores

    def g_eval_single_metric(
        self,
        run_output: RunOutput,
        metric: str,
        metric_offsets: Dict[str, int],
        raw_output: str,
    ) -> float | None:
        """
        Run the G-Eval for a single metric.

        Scan the logprobs for the metric and return the weighted score of the rating token.
        """

        start_offset, end_offset = self.token_search_range(
            raw_output, metric, metric_offsets
        )

        offset = 0

        if (
            run_output.output_logprobs is None
            or run_output.output_logprobs.content is None
        ):
            raise RuntimeError(
                "No logprobs found for output - cannot calculate G-Eval"
            )

        # scan the tokens in the range, looking for the rating token
        for chat_logprob in run_output.output_logprobs.content:
            if offset >= end_offset:
                break
            if offset >= start_offset:
                score = self.rating_token_to_score(chat_logprob)
                if score is not None:
                    return score
            offset += len(chat_logprob.token)

        return None

    def raw_output_from_logprobs(self, run_output: RunOutput) -> str:
        """
        Build the raw output string from the logprobs. Generated from logprobs so it's guaranteed to match the logprob token offsets.
        """
        if (
            run_output.output_logprobs is None
            or run_output.output_logprobs.content is None
        ):
            raise RuntimeError(
                "No logprobs found for output - cannot calculate G-Eval"
            )

        raw = ""
        for chat_logprob in run_output.output_logprobs.content:
            raw += chat_logprob.token
        return raw

    def token_search_range(
        self, raw_output: str, metric: str, metric_offsets: Dict[str, int]
    ) -> Tuple[int, int]:
        """
        Find the start and end offsets of the metric in the raw output.

        Start searching after the end of the target metric json entry ("overall_rating":), and before the start of the next metric ("some_other_score").
        """
        start_offset = metric_offsets[metric] + len(metric)

        # Find the lowest end offset that is greater than the start offset
        end_offset = len(raw_output)
        for v in list(metric_offsets.values()):
            if v < end_offset and v > start_offset:
                end_offset = v

        return start_offset, end_offset

    def rating_token_to_score(
        self, token_logprob: ChatCompletionTokenLogprob
    ) -> float | None:
        """
        Convert a rating token to a score using a weighted average of the top logprobs.

        Only includes tokens that have valid scores.

        Some cleanup for upper case, whitespace and quotes. LLMs aren't always consistent.
        """
        primary_token_score = self.score_from_token_string(token_logprob.token)
        # check this is a real rating token, it could just be the ": ", "," or whitespace
        if primary_token_score is None:
            return None

        total_score = 0.0
        total_probability = 0.0
        top_logprobs_contains_primary_token = False

        # Process all valid scoring tokens from alternatives
        for top_logprob in token_logprob.top_logprobs:
            if top_logprob.token == token_logprob.token:
                top_logprobs_contains_primary_token = True
            token_score = self.score_from_token_string(top_logprob.token)
            if token_score is not None:
                # Convert logprob to probability
                probability = math.exp(top_logprob.logprob)
                total_score += token_score * probability
                total_probability += probability

        # Weird OpenAI 4o bug - sometimes the primary token is included in the top logprobs, sometimes not.
        # Add the primary token back in if it was excluded
        if not top_logprobs_contains_primary_token:
            if token_logprob.logprob == -9999.0:
                # Another "bug" - sometimes the logprob is -9999.0. This seems to happen when the rest of the logprobs have tiny probabilities.
                total_score += primary_token_score * 1.0
                total_probability += 1.0
            else:
                probability = math.exp(token_logprob.logprob)
                total_score += primary_token_score * probability
                total_probability += probability

        if total_probability <= 0.0:
            raise RuntimeError(
                f"No valid scoring tokens found for {token_logprob.token}. This should never happen as the token has a valid score (so it must be excluded from top logprobs). Please file a bug if you see this."
            )

        # Normalize by total probability of valid tokens (the LLM may have wanted to generate other non-rating tokens; these shouldn't lower the score of the rating tokens)
        weighted_score = total_score / total_probability

        return weighted_score

    def score_from_token_string(self, token: str) -> float | None:
        if token in TOKEN_TO_SCORE_MAP:
            return TOKEN_TO_SCORE_MAP[token]

        # handle more token variations like '"1"' and '"pass"' and ' paSS' and 'PASS'
        unquoted_token = token.strip().strip('"').lower()
        if unquoted_token in TOKEN_TO_SCORE_MAP:
            return TOKEN_TO_SCORE_MAP[unquoted_token]

        # handle numeric tokens like "1.0"
        try:
            float_value = float(token)
            if float_value.is_integer():
                str_token = str(int(float_value))
                if str_token in TOKEN_TO_SCORE_MAP:
                    return TOKEN_TO_SCORE_MAP[str_token]
        except ValueError:
            pass

        return None

    def metric_offsets(self, raw_output: str, metrics: List[str]) -> Dict[str, int]:
        """
        Find the offset of the start of each metric in the raw output json.

        For the example json `{"overall_rating": 1}` it should return:
        {
            "overall_rating": 1  # the metric name starts 1 character into the json string
        }
        """
        metric_offsets: Dict[str, int] = {}
        for metric in metrics:
            # the quoted metric name is expected in the json, e.g. "overall_rating" in `{"overall_rating": 1}`
            metric_name = f'"{metric}"'

            # we expect it exactly once
            count = raw_output.count(metric_name)
            if count != 1:
                raise ValueError(
                    f"Metric {metric} should appear exactly once in the output. Found {count} times"
                )

            offset = raw_output.find(metric_name)
            if offset == -1:
                raise ValueError(f"Metric {metric} not found in raw output")
            metric_offsets[metric] = offset
        return metric_offsets
TOKEN_TO_SCORE_MAP: Dict[str, float] = {'1': 1.0, '2': 2.0, '3': 3.0, '4': 4.0, '5': 5.0, 'pass': 1.0, 'fail': 0.0, 'critical': -1.0}
class GEvalTask(kiln_ai.datamodel.task.Task):

Kiln task for executing a G-Eval. Can be run on any Kiln adapter which supports logprobs.

Note: G-Eval implements both G-Eval and LLM as Judge, as they are very similar.
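
For orientation, a minimal sketch (not from the library; values are invented) of the eval_config.properties shape this constructor reads. Only the task_description and eval_steps keys are consumed:

# Hypothetical EvalConfig.properties consumed by GEvalTask.__init__ (illustrative values)
example_properties = {
    "task_description": "Summarize a news article in one short paragraph.",
    "eval_steps": [
        "Check the summary covers the main points of the article.",
        "Check the summary introduces no facts that are not in the article.",
        "Rate the overall quality of the summary.",
    ],
}
# __init__ turns these into the system instruction and the numbered
# chain-of-thought steps ("1) ...", "2) ...") used as thinking_instruction.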

GEvalTask(eval_config: kiln_ai.datamodel.eval.EvalConfig)

Create a new model by parsing and validating input data from keyword arguments.

Raises [ValidationError][pydantic_core.ValidationError] if the input data cannot be validated to form a valid model.

self is explicitly positional-only to allow self as a field name.

model_config = {'validate_assignment': True}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].


class GEval(kiln_ai.adapters.eval.base_eval.BaseEval):

An evaluator which implements G-Eval and LLM as Judge.

G-Eval is a method of evaluating the quality of a model's output. The score is a weighted average of the candidate rating tokens' scores, weighted by their probabilities (derived from the log probabilities of the output tokens). https://arxiv.org/abs/2303.16634

LLM as Judge is a method of evaluating the quality of a model's output. It simply asks the LLM to score, and uses the returned output (no logprobs needed). Also called direct evaluation.

@misc{liu2023gevalnlgevaluationusing, title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment}, author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu}, year={2023}, eprint={2303.16634}, archivePrefix={arXiv}, primaryClass={cs.CL}, url={https://arxiv.org/abs/2303.16634}, }
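
A minimal usage sketch, assuming you already have a valid EvalConfig (with a g_eval or llm_as_judge config_type), run config properties, and a TaskRun to score; the helper name and variables here are hypothetical:

# Hypothetical usage; eval_config, run_config_properties and task_run are assumed to exist.
import asyncio

from kiln_ai.adapters.eval.g_eval import GEval


async def score_one(eval_config, run_config_properties, task_run):
    evaluator = GEval(eval_config, run_config_properties)
    # run_eval returns (scores, intermediate_outputs); scores maps metric name -> float
    scores, intermediate_outputs = await evaluator.run_eval(task_run)
    return scores

# scores = asyncio.run(score_one(my_eval_config, my_run_config_properties, my_task_run))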

GEval(eval_config: kiln_ai.datamodel.eval.EvalConfig, run_config: kiln_ai.datamodel.run_config.RunConfigProperties | None)
geval_task
def generate_run_description(self, eval_input: str, eval_output: str) -> str:
106    def generate_run_description(self, eval_input: str, eval_output: str) -> str:
107        return f"""The model was given the following input for the task: 
108<eval_data>
109{eval_input}
110</eval_data>
111
112The model produced the following output for the task:
113<eval_data>
114{eval_output}
115</eval_data>
116"""
async def run_eval(self, task_run: kiln_ai.datamodel.TaskRun) -> tuple[typing.Dict[str, float], typing.Optional[typing.Dict[str, str]]]:

Run this eval on the given task run.
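
The return value pairs the scores with any intermediate outputs from the judge model. Illustrative shapes only; the metric name and intermediate-output key below are hypothetical:

# Hypothetical return value of run_eval() for an eval with one metric
scores = {"overall_rating": 4.2}                            # EvalScores: metric name -> float
intermediate_outputs = {"chain_of_thought": "First, ..."}   # Dict[str, str] or None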

def build_llm_as_judge_score(self, run_output: kiln_ai.adapters.run_output.RunOutput) -> Dict[str, float]:

Build the LLM as Judge score for the given run and run output.
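
As an illustration (toy judge output, not produced by the library), the conversion maps the judge's discrete answers onto the float values in TOKEN_TO_SCORE_MAP; the real method goes through score_from_token_string, which also tolerates quoting and case variations:

from kiln_ai.adapters.eval.g_eval import TOKEN_TO_SCORE_MAP

# Hypothetical structured output from the judge model
judge_output = {"overall_rating": 4, "formatting": "pass"}
scores = {metric: TOKEN_TO_SCORE_MAP[str(value)] for metric, value in judge_output.items()}
# scores == {"overall_rating": 4.0, "formatting": 1.0}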

def build_g_eval_score(self, run_output: kiln_ai.adapters.run_output.RunOutput) -> Dict[str, float]:

Build the G-Eval score for the given run and run output.

We create a weighted average of each rating using the logprobs.

@misc{liu2023gevalnlgevaluationusing, title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment}, author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu}, year={2023}, eprint={2303.16634}, archivePrefix={arXiv}, primaryClass={cs.CL}, url={https://arxiv.org/abs/2303.16634}, }
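
The flow, walked through on a toy raw output (hand-computed values; this mirrors the steps in the source above rather than defining a separate API):

# Illustrative walk-through of the G-Eval scoring flow (toy data)
raw = '{"overall_rating": 4}'             # what raw_output_from_logprobs() rebuilds
offset = raw.find('"overall_rating"')     # 1, what metric_offsets() records
start = offset + len("overall_rating")    # 15, per token_search_range()
end = len(raw)                            # 21: no later metric, so search to the end
# g_eval_single_metric() then scans the logprob tokens covering raw[start:end],
# finds the rating token "4", and returns its probability-weighted score.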

def g_eval_single_metric(self, run_output: kiln_ai.adapters.run_output.RunOutput, metric: str, metric_offsets: Dict[str, int], raw_output: str) -> float | None:

Run the G-Eval for a single metric.

Scan the logprobs for the metric and return the weighted score of the rating token.

def raw_output_from_logprobs(self, run_output: kiln_ai.adapters.run_output.RunOutput) -> str:

Build the raw output string from the logprobs. Generated from logprobs so it's guaranteed to match the logprob token offsets.
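
For example, with a hypothetical tokenization of the judge's JSON output:

# Hypothetical token stream from the logprobs; joining the tokens reproduces
# the exact JSON text that the metric offsets refer to.
tokens = ['{"', 'overall', '_rating', '":', ' ', '4', '}']
raw_output = "".join(tokens)  # '{"overall_rating": 4}'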

def token_search_range(self, raw_output: str, metric: str, metric_offsets: Dict[str, int]) -> Tuple[int, int]:

Find the start and end offsets of the metric in the raw output.

Start searching after the end of the target metric json entry ("overall_rating":), and before the start of the next metric ("some_other_score").
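
A worked example with two metrics (toy string, offsets computed by hand):

raw = '{"accuracy": 4, "overall_rating": 5}'
metric_offsets = {"accuracy": 1, "overall_rating": 16}  # as metric_offsets() would return
# "accuracy":       start = 1 + len("accuracy") = 9,          end = 16 (start of the next metric)
# "overall_rating": start = 16 + len("overall_rating") = 30,  end = len(raw) = 36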

def rating_token_to_score(self, token_logprob: litellm.types.utils.ChatCompletionTokenLogprob) -> float | None:

Convert a rating token to a score using a weighted average of the top logprobs.

Only includes tokens that have valid scores.

Some cleanup for upper case, whitespace and quotes. LLMs aren't always consistent.
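
A hand-computed example of the weighting (toy logprobs, not real model output):

import math

# Suppose the sampled rating token is "4" and the alternatives include "5".
toy_logprobs = {"4": -0.223, "5": -1.61}                         # hypothetical values
probs = {tok: math.exp(lp) for tok, lp in toy_logprobs.items()}  # ~{"4": 0.80, "5": 0.20}
score = (4.0 * probs["4"] + 5.0 * probs["5"]) / sum(probs.values())
# score ~= 4.2: above a flat 4.0 because some probability mass sat on "5".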

def score_from_token_string(self, token: str) -> float | None:
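
Illustrative inputs and the scores they resolve to under the cleanup rules above (hand-derived, covering the raw map keys, quoted/cased variants, and integer-valued floats):

# score_from_token_string('5')       -> 5.0   (direct map hit)
# score_from_token_string('"pass"')  -> 1.0   (quotes stripped)
# score_from_token_string(' FAIL ')  -> 0.0   (whitespace stripped, lowercased)
# score_from_token_string('4.0')     -> 4.0   (integer-valued float)
# score_from_token_string(', ')      -> None  (not a rating token)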
def metric_offsets(self, raw_output: str, metrics: List[str]) -> Dict[str, int]:

Find the offset of the start of each metric in the raw output json.

For the example json {"overall_rating": 1} it should return {"overall_rating": 1}: the metric name starts 1 character into the json string.
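
A worked example on a toy raw output (offsets computed by hand):

raw = '{"accuracy": 4, "overall_rating": 5}'
# metric_offsets(raw, ["accuracy", "overall_rating"]) would return:
# {"accuracy": 1, "overall_rating": 16}
# i.e. the index of each quoted metric name within the raw JSON string.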