kiln_ai.adapters.eval.g_eval

  1import math
  2from typing import Dict, List, Tuple
  3
  4from litellm.types.utils import ChatCompletionTokenLogprob
  5
  6from kiln_ai.adapters.adapter_registry import adapter_for_task
  7from kiln_ai.adapters.eval.base_eval import BaseEval
  8from kiln_ai.adapters.eval.eval_utils.eval_trace_formatter import EvalTraceFormatter
  9from kiln_ai.adapters.eval.eval_utils.eval_utils import EvalUtils
 10from kiln_ai.adapters.ml_model_list import (
 11    default_structured_output_mode_for_model_provider,
 12)
 13from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig, RunOutput
 14from kiln_ai.adapters.prompt_builders import PromptGenerators
 15from kiln_ai.datamodel import Project, Task, TaskRun
 16from kiln_ai.datamodel.eval import EvalConfig, EvalConfigType, EvalDataType, EvalScores
 17from kiln_ai.datamodel.task import RunConfigProperties, StructuredOutputMode
 18
 19# all the tokens we score for, and their float scores.
 20TOKEN_TO_SCORE_MAP: Dict[str, float] = {
 21    "1": 1.0,
 22    "2": 2.0,
 23    "3": 3.0,
 24    "4": 4.0,
 25    "5": 5.0,
 26    "pass": 1.0,
 27    "fail": 0.0,
 28    "critical": -1.0,
 29}
 30
 31
 32class GEvalTask(Task, parent_of={}):
 33    """
 34    Kiln task for executing a G-Eval. Can be run on any Kiln adapter which supports logprobs.
 35
 36    Note G-Eval implements both G-Eval and LLM as Judge, as they are very similar.
 37    """
 38
 39    def __init__(self, eval_config: EvalConfig):
 40        tmp_project = Project(name="GEval")
 41
 42        # Build a simple LLM as Judge system instruction
 43        system_instruction = "Your job is to evaluate a model's performance on a task. Blocks will be marked with <eval_data> tags.\n"
 44        # Optionally add a short task description
 45        task_description = eval_config.properties.get("task_description", None)
 46        if task_description:
 47            system_instruction += f"\nThe task the model was given is as follows:\n<eval_data>\n<task_description>{task_description}</task_description>\n</eval_data>\n"
 48
 49        # Build the COT eval instructions
 50        steps = eval_config.properties.get("eval_steps", [])
 51        if not isinstance(steps, list):
 52            raise ValueError("eval_steps must be a list.")
 53        if len(steps) == 1:
 54            cot_instructions = "First, think step by step about the model's performance following this evaluation step:\n\n"
 55            cot_instructions += f"{steps[0]}\n"
 56        else:
 57            cot_instructions = "First, think step by step about the model's performance following these evaluation steps:\n\n"
 58            for i, step in enumerate(steps):
 59                cot_instructions += f"{i + 1}) {step}\n"
 60
 61        eval = eval_config.parent_eval()
 62        if not eval:
 63            raise ValueError("Eval config must have a parent eval")
 64
 65        # Build the output schema from the eval's target output scores.
 66        # We restrict the LLM's output scoring schema to discrete scores (pass/fail/critical/1-5) - allow_float_scores=False
 67        # However, the final scores from the evaluator can be a float (see later logprob calculation, which requires discrete token outputs)
 68        output_schema = BaseEval.build_score_schema(eval, allow_float_scores=False)
 69
 70        super().__init__(
 71            name="GEval Task",
 72            parent=tmp_project,
 73            instruction=system_instruction,
 74            thinking_instruction=cot_instructions,
 75            output_json_schema=output_schema,
 76        )
 77
 78
 79class GEval(BaseEval):
 80    """
 81    An evaluator which implements G-Eval and LLM as Judge.
 82
 83    G-Eval is a method of evaluating the quality of a model's output. The score is a weighted average of the candidate rating tokens' scores, where the weights are derived from the tokens' log probabilities. https://arxiv.org/abs/2303.16634
 84
 85    LLM as Judge is a method of evaluating the quality of a model's output. It simply asks the LLM to score, and uses the returned output (no logprobs needed). Also called direct evaluation.
 86
 87    @misc{liu2023gevalnlgevaluationusing,
 88        title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
 89        author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
 90        year={2023},
 91        eprint={2303.16634},
 92        archivePrefix={arXiv},
 93        primaryClass={cs.CL},
 94        url={https://arxiv.org/abs/2303.16634},
 95    }
 96    """
 97
 98    def __init__(self, eval_config: EvalConfig, run_config: RunConfigProperties | None):
 99        if (
100            eval_config.config_type != EvalConfigType.g_eval
101            and eval_config.config_type != EvalConfigType.llm_as_judge
102        ):
103            raise ValueError(
104                f"GEval must be initialized with a GEval or LLM as Judge config_type. Got {eval_config.config_type}"
105            )
106
107        super().__init__(eval_config, run_config)
108
109        self.geval_task = GEvalTask(eval_config)
110
111    def generate_final_answer_run_description(
112        self, eval_input: str, eval_output: str
113    ) -> str:
114        return f"""The model was given the following input for the task: 
115<eval_data>
116{eval_input}
117</eval_data>
118
119The model produced the following output for the task:
120<eval_data>
121{eval_output}
122</eval_data>
123"""
124
125    def generate_ref_ans_run_description(
126        self, eval_input: str, eval_output: str, reference_answer: str
127    ) -> str:
128        return f"""The model was given the following input for the task: 
129<eval_data>
130{eval_input}
131</eval_data>
132
133The model produced the following output for the task:
134<eval_data>
135{eval_output}
136</eval_data>
137
138This is the reference answer:
139<eval_data>
140{reference_answer}
141</eval_data>
142"""
143
144    def generate_full_trace_run_description(
145        self,
146        eval_input: str,
147        available_tools: str | None,
148        conversation_history: str,
149    ) -> str:
150        description = ""
151        description += f"""The model was given the following <user_input> for the <task_description>: 
152<eval_data>
153<user_input>{eval_input}</user_input>
154</eval_data>
155"""
156        appropriate_tool_use_guidelines = str(
157            self.eval.template_properties.get("appropriate_tool_use_guidelines") or ""
158        )
159        description += """The model was given the following <appropriate_tool_use_guidelines> guidelines:"""
160        description += f""" 
161<eval_data>
162<appropriate_tool_use_guidelines>
163{appropriate_tool_use_guidelines}
164</appropriate_tool_use_guidelines>
165</eval_data>
166"""
167        inappropriate_tool_use_guidelines = str(
168            self.eval.template_properties.get("inappropriate_tool_use_guidelines") or ""
169        )
170        # Only include if it has content since it is optional
171        if inappropriate_tool_use_guidelines:
172            description += """The model was given the following <inappropriate_tool_use_guidelines> guidelines:"""
173            description += f""" 
174<eval_data>
175<inappropriate_tool_use_guidelines>
176{inappropriate_tool_use_guidelines}
177</inappropriate_tool_use_guidelines>
178</eval_data>
179"""
180
181        if available_tools is not None:
182            if available_tools != "":
183                description += f"""
184This is the list of tools available to the model:
185<eval_data>
186<available_tools>{available_tools}</available_tools>
187</eval_data>
188"""
189            else:
190                description += """
191There were no tools available to the model.
192"""
193
194        description += f"""
195This is the full conversation history for the task run:
196<eval_data>
197<conversation_history>{conversation_history}</conversation_history>
198</eval_data>
199"""
200        return description
201
202    async def run_eval(
203        self, task_run: TaskRun, eval_job_item: TaskRun | None = None
204    ) -> tuple[EvalScores, Dict[str, str] | None]:
205        """
206        Run this eval on the given task run.
207        """
208
209        model_name, provider = self.model_and_provider()
210
211        # Only fetch logprobs for G-Eval
212        # There are at most 5 valid rating tokens per rating type (five_star being the largest), so 10 is more than enough to capture even very unlikely alternatives
213        top_logprobs = (
214            10 if self.eval_config.config_type == EvalConfigType.g_eval else None
215        )
216
217        # We don't expose setting this manually in the UI, so pull a recommended mode from ml_model_list
218        structured_output_mode = default_structured_output_mode_for_model_provider(
219            model_name,
220            provider,
221            default=StructuredOutputMode.json_schema,
222            # G-eval expects JSON, so don't allow function calling modes
223            disallowed_modes=[
224                StructuredOutputMode.function_calling,
225                StructuredOutputMode.function_calling_weak,
226            ],
227        )
228
229        adapter = adapter_for_task(
230            self.geval_task,
231            run_config_properties=RunConfigProperties(
232                model_name=model_name,
233                model_provider_name=provider,
234                # We always use Simple COT for G-Eval and LLM as Judge
235                prompt_id=PromptGenerators.SIMPLE_CHAIN_OF_THOUGHT,
236                structured_output_mode=structured_output_mode,
237            ),
238            base_adapter_config=AdapterConfig(
239                # Don't save this run into the task_runs. It will be saved into an eval_run where it belongs
240                allow_saving=False,
241                top_logprobs=top_logprobs,
242            ),
243        )
244
245        if self.eval.evaluation_data_type == EvalDataType.full_trace:
246            if task_run.trace is None:
247                raise ValueError("Task run trace is required for full trace evaluation")
248
249            available_tools = await EvalUtils.formatted_available_tools_from_task_run(
250                task_run
251            )
252            run_description = self.generate_full_trace_run_description(
253                task_run.input,
254                available_tools,
255                EvalTraceFormatter.trace_to_formatted_conversation_history(
256                    task_run.trace
257                ),
258            )
259
260        elif self.eval.evaluation_data_type == EvalDataType.reference_answer:
261            if eval_job_item is None:
262                raise ValueError(
263                    "Eval job item is required for reference answer evaluation"
264                )
265            run_description = self.generate_ref_ans_run_description(
266                task_run.input, task_run.output.output, eval_job_item.output.output
267            )
268
269        else:  # EvalDataType.final_answer
270            run_description = self.generate_final_answer_run_description(
271                task_run.input, task_run.output.output
272            )
273
274        # We don't need the run, but invoke_returning_run_output() runs validations for us over _run()
275        _, run_output = await adapter.invoke_returning_run_output(run_description)
276
277        if self.eval_config.config_type == EvalConfigType.llm_as_judge:
278            return self.build_llm_as_judge_score(
279                run_output
280            ), run_output.intermediate_outputs
281        else:
282            return self.build_g_eval_score(run_output), run_output.intermediate_outputs
283
284    def build_llm_as_judge_score(self, run_output: RunOutput) -> EvalScores:
285        """
286        Build the LLM as Judge score for the given run and run output.
287        """
288        # Convert the output format we asked for (discrete values) to our float scores
289        scores: EvalScores = {}
290        if not isinstance(run_output.output, dict):
291            raise ValueError("LLM as Judge output must be a dictionary")
292
293        for metric, score in run_output.output.items():
294            token_score = self.score_from_token_string(f"{score}")
295            if token_score is None:
296                raise ValueError(
297                    f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema."
298                )
299            scores[metric] = token_score
300        return scores
301
302    def build_g_eval_score(self, run_output: RunOutput) -> EvalScores:
303        """
304        Build the G-Eval score for the given run and run output.
305
306        We create a weighted average of each rating using the logprobs.
307
308        @misc{liu2023gevalnlgevaluationusing,
309            title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
310            author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
311            year={2023},
312            eprint={2303.16634},
313            archivePrefix={arXiv},
314            primaryClass={cs.CL},
315            url={https://arxiv.org/abs/2303.16634},
316        }
317        """
318        # We use structured output
319        outputs = run_output.output
320        assert isinstance(outputs, dict)
321
322        # Build raw string output from the logprobs, which is easier to work with than Dict for the next bit
323        raw_output = self.raw_output_from_logprobs(run_output)
324
325        # find the offset of the start of each metric in the raw output json
326        metrics: List[str] = list(outputs.keys())
327        metric_offsets = self.metric_offsets(raw_output, metrics)
328
329        final_scores: EvalScores = {}
330        for metric in metrics:
331            score = self.g_eval_single_metric(
332                run_output, metric, metric_offsets, raw_output
333            )
334            if score is None:
335                raise ValueError(
336                    f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema."
337                )
338            final_scores[metric] = score
339
340        return final_scores
341
342    def g_eval_single_metric(
343        self,
344        run_output: RunOutput,
345        metric: str,
346        metric_offsets: Dict[str, int],
347        raw_output: str,
348    ) -> float | None:
349        """
350        Run the G-Eval for a single metric.
351
352        Scan the logprobs for the metric and return the weighted score of the rating token.
353        """
354
355        start_offset, end_offset = self.token_search_range(
356            raw_output, metric, metric_offsets
357        )
358
359        offset = 0
360
361        if (
362            run_output.output_logprobs is None
363            or run_output.output_logprobs.content is None
364        ):
365            raise RuntimeError(
366                "No logprobs found for output - cannot calculate g-eval"
367            )
368
369        # scan the tokens in the range, looking for the rating token
370        for chat_logprob in run_output.output_logprobs.content:
371            if offset >= end_offset:
372                break
373            if offset >= start_offset:
374                score = self.rating_token_to_score(chat_logprob)
375                if score is not None:
376                    return score
377            offset += len(chat_logprob.token)
378
379        return None
380
381    def raw_output_from_logprobs(self, run_output: RunOutput) -> str:
382        """
383        Build the raw output string from the logprobs. Generate from logprobs so it's guaranteed to match the logprobs offsets
384        """
385        if (
386            run_output.output_logprobs is None
387            or run_output.output_logprobs.content is None
388        ):
389            raise RuntimeError(
390                "No logprobs found for output - cannot calculate g-eval"
391            )
392
393        raw = ""
394        for chat_logprob in run_output.output_logprobs.content:
395            raw += chat_logprob.token
396        return raw
397
398    def token_search_range(
399        self, raw_output: str, metric: str, metric_offsets: Dict[str, int]
400    ) -> Tuple[int, int]:
401        """
402        Find the start and end offsets of the metric in the raw output.
403
404        Start searching after the end of the target metric json entry ("overall_rating":), and before the start of the next metric ("some_other_score").
405        """
406        start_offset = metric_offsets[metric] + len(metric)
407
408        # Find the lowest end offset that is greater than the start offset
409        end_offset = len(raw_output)
410        for v in list(metric_offsets.values()):
411            if v < end_offset and v > start_offset:
412                end_offset = v
413
414        return start_offset, end_offset
415
416    def rating_token_to_score(
417        self, token_logprob: ChatCompletionTokenLogprob
418    ) -> float | None:
419        """
420        Convert a rating token to a score using weighted average of top logprobs.
421
422        Only includes tokens that have valid scores.
423
424        Some cleanup for upper case, whitespace and quotes. LLMs aren't always consistent.
425        """
426        primary_token_score = self.score_from_token_string(token_logprob.token)
427        # check this is a real rating token, it could just be the ": ", "," or whitespace
428        if primary_token_score is None:
429            return None
430
431        total_score = 0.0
432        total_probability = 0.0
433        top_logprobs_contains_primary_token = False
434
435        # Process all valid scoring tokens from alternatives
436        for top_logprob in token_logprob.top_logprobs:
437            if top_logprob.token == token_logprob.token:
438                top_logprobs_contains_primary_token = True
439            token_score = self.score_from_token_string(top_logprob.token)
440            if token_score is not None:
441                # Convert logprob to probability
442                probability = math.exp(top_logprob.logprob)
443                total_score += token_score * probability
444                total_probability += probability
445
446        # Weird OpenAI 4o bug - sometimes the primary token is included in the top logprobs, sometimes not.
447        # Add the primary token back in if excluded
448        if not top_logprobs_contains_primary_token:
449            if token_logprob.logprob == -9999.0:
450                # Another "bug" - sometimes the logprob is -9999.0. This seems to happen when the remaining logprobs have very small probabilities.
451                total_score += primary_token_score * 1.0
452                total_probability += 1.0
453            else:
454                probability = math.exp(token_logprob.logprob)
455                total_score += primary_token_score * probability
456                total_probability += probability
457
458        if total_probability <= 0.0:
459            raise RuntimeError(
460                f"No valid scoring tokens found for {token_logprob.token}. This should never happen as the token has a valid score (so it must be excluded from top logprobs). Please file a bug if you see this."
461            )
462
463        # Normalize by total probability of valid tokens (LLM may have wanted to generate other non-rating tokens, these shouldn't lower score of rating tokens)
464        weighted_score = total_score / total_probability
465
466        return weighted_score
467
468    def score_from_token_string(self, token: str) -> float | None:
469        if token in TOKEN_TO_SCORE_MAP:
470            return TOKEN_TO_SCORE_MAP[token]
471
472        # handle more token variations like '"1"' and '"pass"' and ' paSS' and 'PASS'
473        unquoted_token = token.strip().strip('"').lower()
474        if unquoted_token in TOKEN_TO_SCORE_MAP:
475            return TOKEN_TO_SCORE_MAP[unquoted_token]
476
477        # handle numeric tokens like "1.0"
478        try:
479            float_value = float(token)
480            if float_value.is_integer():
481                str_token = str(int(float_value))
482                if str_token in TOKEN_TO_SCORE_MAP:
483                    return TOKEN_TO_SCORE_MAP[str_token]
484        except ValueError:
485            pass
486
487        return None
488
489    def metric_offsets(self, raw_output: str, metrics: List[str]) -> Dict[str, int]:
490        """
491        Find the offset to the start of each metric in the raw output json
492
493        For the example json: `{"overall_rating": 1}`
494
495        should return:
496        {
497            "overall_rating": 1 # it's 1 character into the json string
498        }
499        """
500        metric_offsets: Dict[str, int] = {}
501        for metric in metrics:
502            # the quoted metric name is expected in the json, e.g. `{"overall_rating": 1}`
503            metric_name = f'"{metric}"'
504
505            # we expect it exactly once
506            count = raw_output.count(metric_name)
507            if count != 1:
508                raise ValueError(
509                    f"Metric {metric} should appear exactly once in the output. Found {count} times"
510                )
511
512            offset = raw_output.find(metric_name)
513            if offset == -1:
514                raise ValueError(f"Metric {metric} not found in raw output")
515            metric_offsets[metric] = offset
516        return metric_offsets
TOKEN_TO_SCORE_MAP: Dict[str, float] = {'1': 1.0, '2': 2.0, '3': 3.0, '4': 4.0, '5': 5.0, 'pass': 1.0, 'fail': 0.0, 'critical': -1.0}
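The map is keyed by the literal token strings a judge is expected to emit. Before the lookup, GEval.score_from_token_string (source above) also normalizes quoting, case, whitespace, and integer-valued floats. A minimal standalone sketch of that normalization, for illustration only (not the library API):

def normalize_and_score(token: str) -> float | None:
    # Sketch mirroring the normalization in GEval.score_from_token_string.
    scores = {"1": 1.0, "2": 2.0, "3": 3.0, "4": 4.0, "5": 5.0,
              "pass": 1.0, "fail": 0.0, "critical": -1.0}
    if token in scores:
        return scores[token]
    cleaned = token.strip().strip('"').lower()   # ' "PASS"' -> 'pass'
    if cleaned in scores:
        return scores[cleaned]
    try:
        value = float(token)                     # '4.0' -> 4.0 -> '4'
        if value.is_integer() and str(int(value)) in scores:
            return scores[str(int(value))]
    except ValueError:
        pass
    return None

assert normalize_and_score(' "PASS"') == 1.0
assert normalize_and_score("4.0") == 4.0
assert normalize_and_score(",") is None          # punctuation is not a rating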
class GEvalTask(kiln_ai.datamodel.task.Task):

Kiln task for executing a G-Eval. Can be run on any Kiln adapter which supports logprobs.

Note G-Eval implements both G-Eval and LLM as Judge, as they are very similar.

GEvalTask(eval_config: kiln_ai.datamodel.eval.EvalConfig)

Create a new model by parsing and validating input data from keyword arguments.

Raises [ValidationError][pydantic_core.ValidationError] if the input data cannot be validated to form a valid model.

self is explicitly positional-only to allow self as a field name.

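For illustration, here is roughly what the constructor assembles for a hypothetical eval with a task description and two evaluation steps. This is a paraphrase of the string building shown in the source above, with made-up inputs, not output captured from the library:

# Hypothetical inputs; the real values come from EvalConfig.properties.
task_description = "Summarize a news article in one paragraph."
eval_steps = [
    "Check that the summary covers the main points of the article.",
    "Check that the summary is a single paragraph.",
]

system_instruction = (
    "Your job is to evaluate a model's performance on a task. "
    "Blocks will be marked with <eval_data> tags.\n"
    "\nThe task the model was given is as follows:\n<eval_data>\n"
    f"<task_description>{task_description}</task_description>\n</eval_data>\n"
)

thinking_instruction = (
    "First, think step by step about the model's performance "
    "following these evaluation steps:\n\n"
)
for i, step in enumerate(eval_steps):
    thinking_instruction += f"{i + 1}) {step}\n"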
model_config = {'validate_assignment': True}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

def model_post_init(self: pydantic.main.BaseModel, context: Any, /) -> None:

This function is meant to behave like a BaseModel method to initialise private attributes.

It takes context as an argument since that's what pydantic-core passes when calling it.

Args: self: The BaseModel instance. context: The context.

class GEval(kiln_ai.adapters.eval.base_eval.BaseEval):

An evaluator which implements G-Eval and LLM as Judge.

G-Eval is a method of evaluating the quality of a model's output. The score is a weighted average of the candidate rating tokens' scores, where the weights are derived from the tokens' log probabilities. https://arxiv.org/abs/2303.16634

LLM as Judge is a method of evaluating the quality of a model's output. It simply asks the LLM to score, and uses the returned output (no logprobs needed). Also called direct evaluation.

@misc{liu2023gevalnlgevaluationusing, title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment}, author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu}, year={2023}, eprint={2303.16634}, archivePrefix={arXiv}, primaryClass={cs.CL}, url={https://arxiv.org/abs/2303.16634}, }

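As a concrete illustration of the G-Eval weighting, suppose the judge emits a rating token whose top logprobs (hypothetical numbers) put most of the probability on "4", some on "5", a little on "3", and some on a non-rating token. A minimal sketch of the weighted average that rating_token_to_score computes:

import math

# Hypothetical top-logprob alternatives at the rating token position.
top_logprobs = {"4": -0.22, "5": -1.65, "3": -3.20, ",": -5.00}
token_to_score = {"3": 3.0, "4": 4.0, "5": 5.0}

total_score = 0.0
total_probability = 0.0
for token, logprob in top_logprobs.items():
    score = token_to_score.get(token)
    if score is None:
        continue                      # non-rating tokens (",") are ignored
    p = math.exp(logprob)             # convert logprob to probability
    total_score += score * p
    total_probability += p

weighted = total_score / total_probability
print(round(weighted, 2))             # ~4.15: mostly a 4, pulled up by the 5

An LLM as Judge config would instead take the emitted value at face value and score this example as exactly 4.0.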
GEval(eval_config: kiln_ai.datamodel.eval.EvalConfig, run_config: kiln_ai.datamodel.run_config.RunConfigProperties | None)
geval_task
def generate_final_answer_run_description(self, eval_input: str, eval_output: str) -> str:
def generate_ref_ans_run_description(self, eval_input: str, eval_output: str, reference_answer: str) -> str:
def generate_full_trace_run_description(self, eval_input: str, available_tools: str | None, conversation_history: str) -> str:
async def run_eval(self, task_run: kiln_ai.datamodel.TaskRun, eval_job_item: kiln_ai.datamodel.TaskRun | None = None) -> tuple[typing.Dict[str, float], typing.Optional[typing.Dict[str, str]]]:

Run this eval on the given task run.

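A minimal usage sketch, assuming a GEval instance and a saved TaskRun to score already exist (how those are created or loaded depends on your project setup):

import asyncio

async def score_one(geval, task_run):
    # run_eval returns (scores, intermediate_outputs); the intermediate
    # outputs hold the judge's chain-of-thought when one was produced.
    scores, intermediate_outputs = await geval.run_eval(task_run)
    return scores, intermediate_outputs

# scores, cot = asyncio.run(score_one(geval, task_run))  # geval, task_run assumed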
def build_llm_as_judge_score(self, run_output: kiln_ai.adapters.run_output.RunOutput) -> Dict[str, float]:

Build the LLM as Judge score for the given run and run output.

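For example, if the judge returned the structured output below, the resulting scores would be the float values on the right (metric names and values are hypothetical):

# Hypothetical structured output from the judge model:
judge_output = {"overall_rating": "4", "tone": "pass"}

# Each value is passed through score_from_token_string, giving:
expected_scores = {"overall_rating": 4.0, "tone": 1.0}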
def build_g_eval_score(self, run_output: kiln_ai.adapters.run_output.RunOutput) -> Dict[str, float]:
303    def build_g_eval_score(self, run_output: RunOutput) -> EvalScores:
304        """
305        Build the G-Eval score for the given run and run output.
306
307        We create a weighted average of each rating using the logprobs.
308
309        @misc{liu2023gevalnlgevaluationusing,
310            title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
311            author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
312            year={2023},
313            eprint={2303.16634},
314            archivePrefix={arXiv},
315            primaryClass={cs.CL},
316            url={https://arxiv.org/abs/2303.16634},
317        }
318        """
319        # We use structured output
320        outputs = run_output.output
321        assert isinstance(outputs, dict)
322
323        # Build raw string output from the logprobs, which is easier to work with than Dict for the next bit
324        raw_output = self.raw_output_from_logprobs(run_output)
325
326        # find the offset of the start of each metric in the raw output json
327        metrics: List[str] = list(outputs.keys())
328        metric_offsets = self.metric_offsets(raw_output, metrics)
329
330        final_scores: EvalScores = {}
331        for metric in metrics:
332            score = self.g_eval_single_metric(
333                run_output, metric, metric_offsets, raw_output
334            )
335            if score is None:
336                raise ValueError(
337                    f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema."
338                )
339            final_scores[metric] = score
340
341        return final_scores

Build the G-Eval score for the given run and run output.

We create a weighted average of each rating using the logprobs.

Reference: Yang Liu, Dan Iter, Yichong Xu, Shuohang Wang, Ruochen Xu and Chenguang Zhu (2023), "G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment", arXiv:2303.16634, https://arxiv.org/abs/2303.16634
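
A worked example of the weighted-average idea (the probabilities are illustrative, not real logprobs): if the rating position's top logprobs put 60% on "4", 30% on "5" and 10% on "3", the G-Eval score for that metric is the probability-weighted mean rather than the single sampled token:

    import math

    # Illustrative (token, logprob) alternatives at the rating position
    top_logprobs = [("4", math.log(0.6)), ("5", math.log(0.3)), ("3", math.log(0.1))]

    total_score = sum(float(token) * math.exp(logprob) for token, logprob in top_logprobs)
    total_probability = sum(math.exp(logprob) for _, logprob in top_logprobs)
    print(total_score / total_probability)   # ~4.2 rather than a hard 4.0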

def g_eval_single_metric(self, run_output: kiln_ai.adapters.run_output.RunOutput, metric: str, metric_offsets: Dict[str, int], raw_output: str) -> float | None:
343    def g_eval_single_metric(
344        self,
345        run_output: RunOutput,
346        metric: str,
347        metric_offsets: Dict[str, int],
348        raw_output: str,
349    ) -> float | None:
350        """
351        Run the G-Eval for a single metric.
352
353        Scan the logprobs for the metric and return the weighted score of the rating token.
354        """
355
356        start_offset, end_offset = self.token_search_range(
357            raw_output, metric, metric_offsets
358        )
359
360        offset = 0
361
362        if (
363            run_output.output_logprobs is None
364            or run_output.output_logprobs.content is None
365        ):
366            raise RuntimeError(
367                "No logprobs found for output - can not calculate g-eval"
368            )
369
370        # scan the tokens in the range, looking for the rating token
371        for _, chat_logprob in enumerate(run_output.output_logprobs.content):
372            if offset >= end_offset:
373                break
374            if offset >= start_offset:
375                score = self.rating_token_to_score(chat_logprob)
376                if score is not None:
377                    return score
378            offset += len(chat_logprob.token)
379
380        return None

Run the G-Eval for a single metric.

Scan the logprobs for the metric and return the weighted score of the rating token.
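
To make the offset scan concrete, here is a standalone sketch of the same loop (the token stream is hypothetical): walk the tokens, track the character offset, and return the first token inside the search window that resolves to a score:

    # Hypothetical token stream for the raw output '{"overall_rating": 4}'
    tokens = ['{"', 'overall', '_rating', '":', ' ', '4', '}']

    def first_rating_token(tokens, start_offset, end_offset, score_fn):
        offset = 0
        for token in tokens:
            if offset >= end_offset:
                break
            if offset >= start_offset and (score := score_fn(token)) is not None:
                return score
            offset += len(token)
        return None

    # Search window 15..21 (just past the metric name); a simple lookup stands in for the score map
    print(first_rating_token(tokens, 15, 21, {"4": 4.0}.get))   # 4.0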

def raw_output_from_logprobs(self, run_output: kiln_ai.adapters.run_output.RunOutput) -> str:
382    def raw_output_from_logprobs(self, run_output: RunOutput) -> str:
383        """
384        Build the raw output string from the logprobs. Generate from logprobs so it's guaranteed to match the logprobs offsets
385        """
386        if (
387            run_output.output_logprobs is None
388            or run_output.output_logprobs.content is None
389        ):
390            raise RuntimeError(
391                "No logprobs found for output - can not calculate g-eval"
392            )
393
394        raw = ""
395        for chat_logprob in run_output.output_logprobs.content:
396            raw += chat_logprob.token
397        return raw

Build the raw output string from the logprobs. Generate from logprobs so it's guaranteed to match the logprobs offsets
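
In other words, the raw string is just the concatenation of the generated tokens, so character offsets computed against it line up exactly with the token stream. A one-line equivalent sketch (assuming a run_output that has logprobs, as above):

    raw_output = "".join(lp.token for lp in run_output.output_logprobs.content)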

def token_search_range(self, raw_output: str, metric: str, metric_offsets: Dict[str, int]) -> Tuple[int, int]:
399    def token_search_range(
400        self, raw_output: str, metric: str, metric_offsets: Dict[str, int]
401    ) -> Tuple[int, int]:
402        """
403        Find the start and end offsets of the metric in the raw output.
404
405        Start searching after the end of the target metric json entry ("overall_rating":), and before the start of the next metric ("some_other_score").
406        """
407        start_offset = metric_offsets[metric] + len(metric)
408
409        # Find the lowest end offset that is greater than the start offset
410        end_offset = len(raw_output)
411        for v in list(metric_offsets.values()):
412            if v < end_offset and v > start_offset:
413                end_offset = v
414
415        return start_offset, end_offset

Find the start and end offsets of the metric in the raw output.

Start searching after the end of the target metric json entry ("overall_rating":), and before the start of the next metric ("some_other_score").
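
A worked example with an illustrative raw output: the window for each metric starts just past its name and ends at the next metric's offset (or at the end of the string for the last metric):

    raw_output = '{"overall_rating": 4, "accuracy": 5}'      # illustrative judge output
    metric_offsets = {"overall_rating": 1, "accuracy": 22}   # offsets of the quoted metric names

    # "overall_rating": search from 1 + len("overall_rating") = 15 up to 22 (the next metric)
    # "accuracy":       search from 22 + len("accuracy") = 30 up to len(raw_output) = 36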

def rating_token_to_score(self, token_logprob: litellm.types.utils.ChatCompletionTokenLogprob) -> float | None:
417    def rating_token_to_score(
418        self, token_logprob: ChatCompletionTokenLogprob
419    ) -> float | None:
420        """
421        Convert a rating token to a score using weighted average of top logprobs.
422
423        Only includes tokens that have valid scores.
424
425        Some cleanup for upper case, whitespace and quotes. LLMs aren't always consistent.
426        """
427        primary_token_score = self.score_from_token_string(token_logprob.token)
428        # check this is a real rating token, it could just be the ": ", "," or whitespace
429        if primary_token_score is None:
430            return None
431
432        total_score = 0.0
433        total_probability = 0.0
434        top_logprobs_contains_primary_token = False
435
436        # Process all valid scoring tokens from alternatives
437        for top_logprob in token_logprob.top_logprobs:
438            if top_logprob.token == token_logprob.token:
439                top_logprobs_contains_primary_token = True
440            token_score = self.score_from_token_string(top_logprob.token)
441            if token_score is not None:
442                # Convert logprob to probability
443                probability = math.exp(top_logprob.logprob)
444                total_score += token_score * probability
445                total_probability += probability
446
447        # Weird OpenAI 4o bug - sometimes the primary token is included in the top logprobs, sometimes not.
448        # Add the primary token back in if excluded
449        if not top_logprobs_contains_primary_token:
450            if token_logprob.logprob == -9999.0:
451                # Another "bug" - sometimes the logprob is -9999.0. This seems to happen when the rest of the logprobs are tiny probability.
452                total_score += primary_token_score * 1.0
453                total_probability += 1.0
454            else:
455                probability = math.exp(token_logprob.logprob)
456                total_score += primary_token_score * probability
457                total_probability += probability
458
459        if total_probability <= 0.0:
460            raise RuntimeError(
461                f"No valid scoring tokens found for {token_logprob.token}. This should never happen as the token has a valid score (so it must be excluded from top logprobs). Please file a bug if you see this."
462            )
463
464        # Normalize by total probability of valid tokens (LLM may have wanted to generate other non-rating tokens, these shouldn't lower score of rating tokens)
465        weighted_score = total_score / total_probability
466
467        return weighted_score

Convert a rating token to a score using weighted average of top logprobs.

Only includes tokens that have valid scores.

Some cleanup for upper case, whitespace and quotes. LLMs aren't always consistent.
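
For intuition, a self-contained sketch of the normalization step (the logprob values are made up): alternatives that are not rating tokens, such as punctuation, are simply ignored, and the remaining probability mass is renormalized so those non-rating tokens do not drag the score down:

    import math

    # Hypothetical top-logprob alternatives at the rating position
    alternatives = {"4": math.log(0.55), "5": math.log(0.25), ",": math.log(0.15), '"3"': math.log(0.05)}
    ratings = {"3": 3.0, "4": 4.0, "5": 5.0}   # stand-in for the relevant TOKEN_TO_SCORE_MAP entries

    def score(token):
        return ratings.get(token.strip().strip('"').lower())

    total_score = sum(score(t) * math.exp(lp) for t, lp in alternatives.items() if score(t) is not None)
    total_probability = sum(math.exp(lp) for t, lp in alternatives.items() if score(t) is not None)
    print(total_score / total_probability)   # ~4.24: the "," alternative is ignored, not counted as zero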

def score_from_token_string(self, token: str) -> float | None:
469    def score_from_token_string(self, token: str) -> float | None:
470        if token in TOKEN_TO_SCORE_MAP:
471            return TOKEN_TO_SCORE_MAP[token]
472
473        # handle more token variations like '"1"' and '"pass"' and ' paSS' and 'PASS'
474        unquoted_token = token.strip().strip('"').lower()
475        if unquoted_token in TOKEN_TO_SCORE_MAP:
476            return TOKEN_TO_SCORE_MAP[unquoted_token]
477
478        # handle numeric tokens like "1.0"
479        try:
480            float_value = float(token)
481            if float_value.is_integer():
482                str_token = str(int(float_value))
483                if str_token in TOKEN_TO_SCORE_MAP:
484                    return TOKEN_TO_SCORE_MAP[str_token]
485        except ValueError:
486            pass
487
488        return None
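
score_from_token_string has no docstring, so a few illustrative inputs may help. The sketch below mirrors its three lookup paths (exact match, cleaned-up match, integer-valued numeric string) against the module's TOKEN_TO_SCORE_MAP rather than calling the method itself:

    from kiln_ai.adapters.eval.g_eval import TOKEN_TO_SCORE_MAP

    def resolve(token: str) -> float | None:
        # mirrors score_from_token_string above
        if token in TOKEN_TO_SCORE_MAP:
            return TOKEN_TO_SCORE_MAP[token]
        cleaned = token.strip().strip('"').lower()
        if cleaned in TOKEN_TO_SCORE_MAP:
            return TOKEN_TO_SCORE_MAP[cleaned]
        try:
            value = float(token)
            return TOKEN_TO_SCORE_MAP.get(str(int(value))) if value.is_integer() else None
        except ValueError:
            return None

    assert resolve('"pass"') == 1.0   # quotes stripped
    assert resolve(" FAIL ") == 0.0   # whitespace stripped, lowercased
    assert resolve("4.0") == 4.0      # integer-valued float string resolves to the "4" token
    assert resolve("maybe") is None   # unrecognized tokens are not scored
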
def metric_offsets(self, raw_output: str, metrics: List[str]) -> Dict[str, int]:
490    def metric_offsets(self, raw_output: str, metrics: List[str]) -> Dict[str, int]:
491        """
492        Find the offset to the start of each metric in the raw output json
493
494        For the example json: `{"overall_rating": 1}`
495
496        should return:
497        {
498            "overall_rating": 1 # it's 1 character into the json string
499        }
500        """
501        metric_offsets: Dict[str, int] = {}
502        for metric in metrics:
503            # the quoted metric name (e.g. `"overall_rating"`) is expected in the json
504            metric_name = f'"{metric}"'
505
506            # we expect it exactly once
507            count = raw_output.count(metric_name)
508            if count != 1:
509                raise ValueError(
510                    f"Metric {metric} should appear exactly once in the output. Found {count} times"
511                )
512
513            offset = raw_output.find(metric_name)
514            if offset == -1:
515                raise ValueError(f"Metric {metric} not found in raw output")
516            metric_offsets[metric] = offset
517        return metric_offsets

Find the offset to the start of each metric in the raw output json

For the example json: {"overall_rating": 1} == 1

should return: { "overall_rating": 1 # it's 1 character into the json string }
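
A slightly larger worked example (the raw output is hypothetical): the offset of each metric is simply the position of its quoted name in the raw JSON, and the method raises if a name appears anything other than exactly once.

    raw_output = '{"overall_rating": 4, "accuracy": 5}'   # illustrative judge output
    offsets = {m: raw_output.find(f'"{m}"') for m in ["overall_rating", "accuracy"]}
    assert offsets == {"overall_rating": 1, "accuracy": 22}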