kiln_ai.adapters.eval.g_eval

  1import math
  2from typing import Dict, List, Tuple
  3
  4from litellm.types.utils import ChatCompletionTokenLogprob
  5
  6from kiln_ai.adapters.adapter_registry import adapter_for_task
  7from kiln_ai.adapters.eval.base_eval import BaseEval
  8from kiln_ai.adapters.eval.eval_utils.eval_trace_formatter import EvalTraceFormatter
  9from kiln_ai.adapters.eval.eval_utils.eval_utils import EvalUtils
 10from kiln_ai.adapters.ml_model_list import (
 11    default_structured_output_mode_for_model_provider,
 12)
 13from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig, RunOutput
 14from kiln_ai.adapters.prompt_builders import PromptGenerators
 15from kiln_ai.datamodel import Project, Task, TaskRun
 16from kiln_ai.datamodel.eval import EvalConfig, EvalConfigType, EvalDataType, EvalScores
 17from kiln_ai.datamodel.run_config import KilnAgentRunConfigProperties
 18from kiln_ai.datamodel.task import RunConfigProperties, StructuredOutputMode
 19
 20# all the tokens we score for, and their float scores.
 21TOKEN_TO_SCORE_MAP: Dict[str, float] = {
 22    "1": 1.0,
 23    "2": 2.0,
 24    "3": 3.0,
 25    "4": 4.0,
 26    "5": 5.0,
 27    "pass": 1.0,
 28    "fail": 0.0,
 29    "critical": -1.0,
 30}
 31
 32
 33class GEvalTask(Task, parent_of={}):
 34    """
 35    Kiln task for executing a G-Eval. Can be run on any Kiln adapter which supports logprobs.
 36
 37    Note G-Eval implements both G-Eval and LLM as Judge as they are very similar.
 38    """
 39
 40    def __init__(self, eval_config: EvalConfig):
 41        tmp_project = Project(name="GEval")
 42
 43        # Build a simple LLM as Judge system instruction
 44        system_instruction = "Your job to evaluate a model's performance on a task. Blocks will be marked with <eval_data> tags.\n"
 45        # Optionally add a short task description
 46        task_description = eval_config.properties.get("task_description", None)
 47        if task_description:
 48            system_instruction += f"\nThe task the model was given is as follows:\n<eval_data>\n<task_description>{task_description}</task_description>\n</eval_data>\n"
 49
 50        # Build the COT eval instructions
 51        steps = eval_config.properties.get("eval_steps", [])
 52        if not isinstance(steps, list):
 53            raise ValueError("eval_steps must be a list.")
 54        if len(steps) == 1:
 55            cot_instructions = "First, think step by step about the model's performance following this evaluation step:\n\n"
 56            cot_instructions += f"{steps[0]}\n"
 57        else:
 58            cot_instructions = "First, think step by step about the model's performance following these evaluation steps:\n\n"
 59            for i, step in enumerate(steps):
 60                cot_instructions += f"{i + 1}) {step}\n"
 61
 62        eval = eval_config.parent_eval()
 63        if not eval:
 64            raise ValueError("Eval config must have a parent eval")
 65
 66        # Build the output schema from the eval's target output scores.
 67        # We restrict the LLM's output scoring schema to discrete scores (pass/fail/critical/1-5) - allow_float_scores=False
 68        # However, the final scores from the evaluator can be a float (see later logprob calculation, which requires discrete token outputs)
 69        output_schema = BaseEval.build_score_schema(eval, allow_float_scores=False)
 70
 71        super().__init__(
 72            name="GEval Task",
 73            parent=tmp_project,
 74            instruction=system_instruction,
 75            thinking_instruction=cot_instructions,
 76            output_json_schema=output_schema,
 77        )
 78
 79
 80class GEval(BaseEval):
 81    """
 82    A evaluator which implements G-Eval and LLM as Judge.
 83
 84    G-Eval is a method of evaluating the quality of a model's output. It is a weighted average of the scores of the tokens in the output. The weights are the log probabilities of the tokens in the output. https://arxiv.org/abs/2303.16634
 85
 86    LLM as Judge is a method of evaluating the quality of a model's output. It simply asks the LLM to score, and uses the returned output (no logprobs needed). Also called direct evaluation.
 87
 88    @misc{liu2023gevalnlgevaluationusing,
 89        title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
 90        author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
 91        year={2023},
 92        eprint={2303.16634},
 93        archivePrefix={arXiv},
 94        primaryClass={cs.CL},
 95        url={https://arxiv.org/abs/2303.16634},
 96    }
 97    """
 98
 99    def __init__(self, eval_config: EvalConfig, run_config: RunConfigProperties | None):
100        if (
101            eval_config.config_type != EvalConfigType.g_eval
102            and eval_config.config_type != EvalConfigType.llm_as_judge
103        ):
104            raise ValueError(
105                f"GEval must be initialized with a GEval or LLM as Judge config_type. Got {eval_config.config_type}"
106            )
107
108        super().__init__(eval_config, run_config)
109
110        self.geval_task = GEvalTask(eval_config)
111
112    def generate_final_answer_run_description(
113        self, eval_input: str, eval_output: str
114    ) -> str:
115        return f"""The model was given the following input for the task: 
116<eval_data>
117{eval_input}
118</eval_data>
119
120The model produced the following output for the task:
121<eval_data>
122{eval_output}
123</eval_data>
124"""
125
126    def generate_ref_ans_run_description(
127        self, eval_input: str, eval_output: str, reference_answer: str
128    ) -> str:
129        return f"""The model was given the following input for the task: 
130<eval_data>
131{eval_input}
132</eval_data>
133
134The model produced the following output for the task:
135<eval_data>
136{eval_output}
137</eval_data>
138
139This is the reference answer:
140<eval_data>
141{reference_answer}
142</eval_data>
143"""
144
145    def generate_full_trace_run_description(
146        self,
147        eval_input: str,
148        available_tools: str | None,
149        conversation_history: str,
150    ) -> str:
151        description = ""
152        description += f"""The model was given the following <user_input> for the <task_description>: 
153<eval_data>
154<user_input>{eval_input}</user_input>
155</eval_data>
156"""
157        # Get properties from spec if available, otherwise from eval.template_properties (for legacy evals)
158        spec = self.eval.associated_spec(readonly=True)
159
160        # Spec uses different keys than legacy eval template_properties
161        if spec:
162            # Spec: tool_use_guidelines, appropriate_tool_use_examples, inappropriate_tool_use_examples
163            tool_use_guidelines = str(spec.properties.get("tool_use_guidelines") or "")
164            appropriate_tool_use_examples = str(
165                spec.properties.get("appropriate_tool_use_examples") or ""
166            )
167            inappropriate_tool_use_examples = str(
168                spec.properties.get("inappropriate_tool_use_examples") or ""
169            )
170            description += f"""The model was given the following <tool_use_guidelines>:
171<eval_data>
172<tool_use_guidelines>
173{tool_use_guidelines}
174</tool_use_guidelines>
175</eval_data>
176"""
177            description += f"""The model was given the following <appropriate_tool_use_examples>:
178<eval_data>
179<appropriate_tool_use_examples>
180{appropriate_tool_use_examples}
181</appropriate_tool_use_examples>
182</eval_data>
183"""
184            description += f"""The model was given the following <inappropriate_tool_use_examples>:
185<eval_data>
186<inappropriate_tool_use_examples>
187{inappropriate_tool_use_examples}
188</inappropriate_tool_use_examples>
189</eval_data>
190"""
191        elif self.eval.template_properties:
192            # Legacy eval: appropriate_tool_use_guidelines, inappropriate_tool_use_guidelines
193            appropriate_tool_use_guidelines = str(
194                self.eval.template_properties.get("appropriate_tool_use_guidelines")
195                or ""
196            )
197            inappropriate_tool_use_guidelines = str(
198                self.eval.template_properties.get("inappropriate_tool_use_guidelines")
199                or ""
200            )
201
202            description += f"""The model was given the following <appropriate_tool_use_guidelines> guidelines: 
203<eval_data>
204<appropriate_tool_use_guidelines>
205{appropriate_tool_use_guidelines}
206</appropriate_tool_use_guidelines>
207</eval_data>
208"""
209            # Only include if it has content since it is optional
210            if inappropriate_tool_use_guidelines:
211                description += f"""The model was given the following <inappropriate_tool_use_guidelines> guidelines: 
212<eval_data>
213<inappropriate_tool_use_guidelines>
214{inappropriate_tool_use_guidelines}
215</inappropriate_tool_use_guidelines>
216</eval_data>
217"""
218
219        if available_tools is not None:
220            if available_tools != "":
221                description += f"""
222This is the list of tools available to the model:
223<eval_data>
224<available_tools>{available_tools}</available_tools>
225</eval_data>
226"""
227            else:
228                description += """
229There were no tools available to the model.
230"""
231
232        description += f"""
233This is the full conversation history for the task run:
234<eval_data>
235<conversation_history>{conversation_history}</conversation_history>
236</eval_data>
237"""
238        return description
239
240    async def run_eval(
241        self, task_run: TaskRun, eval_job_item: TaskRun | None = None
242    ) -> tuple[EvalScores, Dict[str, str] | None]:
243        """
244        Run this eval on the given task run.
245        """
246
247        model_name, provider = self.model_and_provider()
248
249        # Only fetch logprobs for G-Eval
250        # There are at most 5 valid rating tokens per rating type (five_star being largest), so 10 is more than enough to get to the very very unlikely
251        top_logprobs = (
252            10 if self.eval_config.config_type == EvalConfigType.g_eval else None
253        )
254
255        # We don't expose setting this manually in the UI, so pull a recommended mode from ml_model_list
256        structured_output_mode = default_structured_output_mode_for_model_provider(
257            model_name,
258            provider,
259            default=StructuredOutputMode.json_schema,
260            # G-eval expects JSON, so don't allow function calling modes
261            disallowed_modes=[
262                StructuredOutputMode.function_calling,
263                StructuredOutputMode.function_calling_weak,
264            ],
265        )
266
267        adapter = adapter_for_task(
268            self.geval_task,
269            run_config_properties=KilnAgentRunConfigProperties(
270                model_name=model_name,
271                model_provider_name=provider,
272                # We always use Simple COT for G-Eval and LLM as Judge
273                prompt_id=PromptGenerators.SIMPLE_CHAIN_OF_THOUGHT,
274                structured_output_mode=structured_output_mode,
275            ),
276            base_adapter_config=AdapterConfig(
277                # Don't save this run into the task_runs. It will be saved into an eval_run where it belongs
278                allow_saving=False,
279                top_logprobs=top_logprobs,
280            ),
281        )
282
283        if self.eval.evaluation_data_type == EvalDataType.full_trace:
284            if task_run.trace is None:
285                raise ValueError("Task run trace is required for full trace evaluation")
286
287            available_tools = await EvalUtils.formatted_available_tools_from_task_run(
288                task_run
289            )
290            run_description = self.generate_full_trace_run_description(
291                task_run.input,
292                available_tools,
293                EvalTraceFormatter.trace_to_formatted_conversation_history(
294                    task_run.trace
295                ),
296            )
297
298        elif self.eval.evaluation_data_type == EvalDataType.reference_answer:
299            if eval_job_item is None:
300                raise ValueError(
301                    "Eval job item is required for reference answer evaluation"
302                )
303            run_description = self.generate_ref_ans_run_description(
304                task_run.input, task_run.output.output, eval_job_item.output.output
305            )
306
307        else:  # EvalDataType.final_answer
308            run_description = self.generate_final_answer_run_description(
309                task_run.input, task_run.output.output
310            )
311
312        # We don't need the run, but invoke_returning_run_output() runs validations for us over _run()
313        _, run_output = await adapter.invoke_returning_run_output(run_description)
314
315        if self.eval_config.config_type == EvalConfigType.llm_as_judge:
316            return self.build_llm_as_judge_score(
317                run_output
318            ), run_output.intermediate_outputs
319        else:
320            return self.build_g_eval_score(run_output), run_output.intermediate_outputs
321
322    def build_llm_as_judge_score(self, run_output: RunOutput) -> EvalScores:
323        """
324        Build the LLM as Judge score for the given run and run output.
325        """
326        # Convert the output format we asked for (discreet values) to our float scores
327        scores: EvalScores = {}
328        if not isinstance(run_output.output, dict):
329            raise ValueError("LLM as Judge output must be a dictionary")
330
331        for metric, score in run_output.output.items():
332            token_score = self.score_from_token_string(f"{score}")
333            if token_score is None:
334                raise ValueError(
335                    f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema."
336                )
337            scores[metric] = token_score
338        return scores
339
340    def build_g_eval_score(self, run_output: RunOutput) -> EvalScores:
341        """
342        Build the G-Eval score for the given run and run output.
343
344        We create a weighted average of each rating using the logprobs.
345
346        @misc{liu2023gevalnlgevaluationusing,
347            title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
348            author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
349            year={2023},
350            eprint={2303.16634},
351            archivePrefix={arXiv},
352            primaryClass={cs.CL},
353            url={https://arxiv.org/abs/2303.16634},
354        }
355        """
356        # We use structured output
357        outputs = run_output.output
358        assert isinstance(outputs, dict)
359
360        # Build raw string output from the logprobs, which is easier to work with than Dict for the next bit
361        raw_output = self.raw_output_from_logprobs(run_output)
362
363        # find the offset the start of each metric in the raw output json
364        metrics: List[str] = list(outputs.keys())
365        metric_offsets = self.metric_offsets(raw_output, metrics)
366
367        final_scores: EvalScores = {}
368        for metric in metrics:
369            score = self.g_eval_single_metric(
370                run_output, metric, metric_offsets, raw_output
371            )
372            if score is None:
373                raise ValueError(
374                    f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema."
375                )
376            final_scores[metric] = score
377
378        return final_scores
379
380    def g_eval_single_metric(
381        self,
382        run_output: RunOutput,
383        metric: str,
384        metric_offsets: Dict[str, int],
385        raw_output: str,
386    ) -> float | None:
387        """
388        Run the G-Eval for a single metric.
389
390        Scan the logprobs for the metric and return the weighted score of the rating token.
391        """
392
393        start_offset, end_offset = self.token_search_range(
394            raw_output, metric, metric_offsets
395        )
396
397        offset = 0
398
399        if (
400            run_output.output_logprobs is None
401            or run_output.output_logprobs.content is None
402        ):
403            raise RuntimeError(
404                "No logprobs found for output - can not calculate g-eval"
405            )
406
407        # scan the tokens in the range, looking for the rating token
408        for _, chat_logprob in enumerate(run_output.output_logprobs.content):
409            if offset >= end_offset:
410                break
411            if offset >= start_offset:
412                score = self.rating_token_to_score(chat_logprob)
413                if score is not None:
414                    return score
415            offset += len(chat_logprob.token)
416
417        return None
418
419    def raw_output_from_logprobs(self, run_output: RunOutput) -> str:
420        """
421        Build the raw output string from the logprobs. Generate from logprobs so it's guaranteed to match the logprobs offsets
422        """
423        if (
424            run_output.output_logprobs is None
425            or run_output.output_logprobs.content is None
426        ):
427            raise RuntimeError(
428                "No logprobs found for output - can not calculate g-eval"
429            )
430
431        raw = ""
432        for chat_logprob in run_output.output_logprobs.content:
433            raw += chat_logprob.token
434        return raw
435
436    def token_search_range(
437        self, raw_output: str, metric: str, metric_offsets: Dict[str, int]
438    ) -> Tuple[int, int]:
439        """
440        Find the start and end offsets of the metric in the raw output.
441
442        Start searching after the end of the target metric json entry ("overall_rating":), and before the start of the next metric ("some_other_score").
443        """
444        start_offset = metric_offsets[metric] + len(metric)
445
446        # Find the lowest end offset that is greater than the start offset
447        end_offset = len(raw_output)
448        for v in list(metric_offsets.values()):
449            if v < end_offset and v > start_offset:
450                end_offset = v
451
452        return start_offset, end_offset
453
454    def rating_token_to_score(
455        self, token_logprob: ChatCompletionTokenLogprob
456    ) -> float | None:
457        """
458        Convert a rating token to a score using weighted average of top logprobs.
459
460        Only includes tokens that have valid scores.
461
462        Some cleanup for upper case, whitespace and quotes. LLMs aren't always consistent.
463        """
464        primary_token_score = self.score_from_token_string(token_logprob.token)
465        # check this is a real rating token, it could just be the ": ", "," or whitespace
466        if primary_token_score is None:
467            return None
468
469        total_score = 0.0
470        total_probability = 0.0
471        top_logprobs_contains_primary_token = False
472
473        # Process all valid scoring tokens from alternatives
474        for top_logprob in token_logprob.top_logprobs:
475            if top_logprob.token == token_logprob.token:
476                top_logprobs_contains_primary_token = True
477            token_score = self.score_from_token_string(top_logprob.token)
478            if token_score is not None:
479                # Convert logprob to probability
480                probability = math.exp(top_logprob.logprob)
481                total_score += token_score * probability
482                total_probability += probability
483
484        # Weird OpenAI 4o bug - sometimes the primary token is included in the top logprobs, sometimes not.
485        # Add the primary token back in if excluded
486        if not top_logprobs_contains_primary_token:
487            if token_logprob.logprob == -9999.0:
488                # Another "bug" - sometimes the logprob is -9999.0. This seems to happen when the rest of the logprobs are tiny probability.
489                total_score += primary_token_score * 1.0
490                total_probability += 1.0
491            else:
492                probability = math.exp(token_logprob.logprob)
493                total_score += primary_token_score * probability
494                total_probability += probability
495
496        if total_probability <= 0.0:
497            raise RuntimeError(
498                f"No valid scoring tokens found for {token_logprob.token}. This should never happen as the token has a valid score (so it must be excluded from top logprobs). Please file a bug if you see this."
499            )
500
501        # Normalize by total probability of valid tokens (LLM may have wanted to generate other non-rating tokens, these shouldn't lower score of rating tokens)
502        weighted_score = total_score / total_probability
503
504        return weighted_score
505
506    def score_from_token_string(self, token: str) -> float | None:
507        if token in TOKEN_TO_SCORE_MAP:
508            return TOKEN_TO_SCORE_MAP[token]
509
510        # handle more token variations like '"1"' and '"pass"' and ' paSS' and 'PASS'
511        unquoted_token = token.strip().strip('"').lower()
512        if unquoted_token in TOKEN_TO_SCORE_MAP:
513            return TOKEN_TO_SCORE_MAP[unquoted_token]
514
515        # handle numeric tokens like "1.0"
516        try:
517            float_value = float(token)
518            if float_value.is_integer():
519                str_token = str(int(float_value))
520                if str_token in TOKEN_TO_SCORE_MAP:
521                    return TOKEN_TO_SCORE_MAP[str_token]
522        except ValueError:
523            pass
524
525        return None
526
527    def metric_offsets(self, raw_output: str, metrics: List[str]) -> Dict[str, int]:
528        """
529        Find the offset to the start of each metric in the raw output json
530
531        For the example json: `{"overall_rating": 1}` == 1
532
533        should return:
534        {
535            "overall_rating": 1 # it's 1 character into the json string
536        }
537        """
538        metric_offsets: Dict[str, int] = {}
539        for metric in metrics:
540            # the quoted metric name is expected in the json: `{"overall_rating": 1}` == 1
541            metric_name = f'"{metric}"'
542
543            # we expect it exactly once
544            count = raw_output.count(metric_name)
545            if count != 1:
546                raise ValueError(
547                    f"Metric {metric} should appear exactly once in the output. Found {count} times"
548                )
549
550            offset = raw_output.find(metric_name)
551            if offset == -1:
552                raise ValueError(f"Metric {metric} not found in raw output")
553            metric_offsets[metric] = offset
554        return metric_offsets
TOKEN_TO_SCORE_MAP: Dict[str, float] = {'1': 1.0, '2': 2.0, '3': 3.0, '4': 4.0, '5': 5.0, 'pass': 1.0, 'fail': 0.0, 'critical': -1.0}
class GEvalTask(kiln_ai.datamodel.task.Task):
34class GEvalTask(Task, parent_of={}):
35    """
36    Kiln task for executing a G-Eval. Can be run on any Kiln adapter which supports logprobs.
37
38    Note G-Eval implements both G-Eval and LLM as Judge as they are very similar.
39    """
40
41    def __init__(self, eval_config: EvalConfig):
42        tmp_project = Project(name="GEval")
43
44        # Build a simple LLM as Judge system instruction
45        system_instruction = "Your job to evaluate a model's performance on a task. Blocks will be marked with <eval_data> tags.\n"
46        # Optionally add a short task description
47        task_description = eval_config.properties.get("task_description", None)
48        if task_description:
49            system_instruction += f"\nThe task the model was given is as follows:\n<eval_data>\n<task_description>{task_description}</task_description>\n</eval_data>\n"
50
51        # Build the COT eval instructions
52        steps = eval_config.properties.get("eval_steps", [])
53        if not isinstance(steps, list):
54            raise ValueError("eval_steps must be a list.")
55        if len(steps) == 1:
56            cot_instructions = "First, think step by step about the model's performance following this evaluation step:\n\n"
57            cot_instructions += f"{steps[0]}\n"
58        else:
59            cot_instructions = "First, think step by step about the model's performance following these evaluation steps:\n\n"
60            for i, step in enumerate(steps):
61                cot_instructions += f"{i + 1}) {step}\n"
62
63        eval = eval_config.parent_eval()
64        if not eval:
65            raise ValueError("Eval config must have a parent eval")
66
67        # Build the output schema from the eval's target output scores.
68        # We restrict the LLM's output scoring schema to discrete scores (pass/fail/critical/1-5) - allow_float_scores=False
69        # However, the final scores from the evaluator can be a float (see later logprob calculation, which requires discrete token outputs)
70        output_schema = BaseEval.build_score_schema(eval, allow_float_scores=False)
71
72        super().__init__(
73            name="GEval Task",
74            parent=tmp_project,
75            instruction=system_instruction,
76            thinking_instruction=cot_instructions,
77            output_json_schema=output_schema,
78        )

Kiln task for executing a G-Eval. Can be run on any Kiln adapter which supports logprobs.

Note G-Eval implements both G-Eval and LLM as Judge as they are very similar.

GEvalTask(eval_config: kiln_ai.datamodel.eval.EvalConfig)
41    def __init__(self, eval_config: EvalConfig):
42        tmp_project = Project(name="GEval")
43
44        # Build a simple LLM as Judge system instruction
45        system_instruction = "Your job to evaluate a model's performance on a task. Blocks will be marked with <eval_data> tags.\n"
46        # Optionally add a short task description
47        task_description = eval_config.properties.get("task_description", None)
48        if task_description:
49            system_instruction += f"\nThe task the model was given is as follows:\n<eval_data>\n<task_description>{task_description}</task_description>\n</eval_data>\n"
50
51        # Build the COT eval instructions
52        steps = eval_config.properties.get("eval_steps", [])
53        if not isinstance(steps, list):
54            raise ValueError("eval_steps must be a list.")
55        if len(steps) == 1:
56            cot_instructions = "First, think step by step about the model's performance following this evaluation step:\n\n"
57            cot_instructions += f"{steps[0]}\n"
58        else:
59            cot_instructions = "First, think step by step about the model's performance following these evaluation steps:\n\n"
60            for i, step in enumerate(steps):
61                cot_instructions += f"{i + 1}) {step}\n"
62
63        eval = eval_config.parent_eval()
64        if not eval:
65            raise ValueError("Eval config must have a parent eval")
66
67        # Build the output schema from the eval's target output scores.
68        # We restrict the LLM's output scoring schema to discrete scores (pass/fail/critical/1-5) - allow_float_scores=False
69        # However, the final scores from the evaluator can be a float (see later logprob calculation, which requires discrete token outputs)
70        output_schema = BaseEval.build_score_schema(eval, allow_float_scores=False)
71
72        super().__init__(
73            name="GEval Task",
74            parent=tmp_project,
75            instruction=system_instruction,
76            thinking_instruction=cot_instructions,
77            output_json_schema=output_schema,
78        )

Create a new model by parsing and validating input data from keyword arguments.

Raises [ValidationError][pydantic_core.ValidationError] if the input data cannot be validated to form a valid model.

self is explicitly positional-only to allow self as a field name.

model_config = {'validate_assignment': True}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

def model_post_init(self: pydantic.main.BaseModel, context: Any, /) -> None:
337def init_private_attributes(self: BaseModel, context: Any, /) -> None:
338    """This function is meant to behave like a BaseModel method to initialise private attributes.
339
340    It takes context as an argument since that's what pydantic-core passes when calling it.
341
342    Args:
343        self: The BaseModel instance.
344        context: The context.
345    """
346    if getattr(self, '__pydantic_private__', None) is None:
347        pydantic_private = {}
348        for name, private_attr in self.__private_attributes__.items():
349            default = private_attr.get_default()
350            if default is not PydanticUndefined:
351                pydantic_private[name] = default
352        object_setattr(self, '__pydantic_private__', pydantic_private)

This function is meant to behave like a BaseModel method to initialise private attributes.

It takes context as an argument since that's what pydantic-core passes when calling it.

Args: self: The BaseModel instance. context: The context.

class GEval(kiln_ai.adapters.eval.base_eval.BaseEval):
class GEval(BaseEval):
    """
    An evaluator which implements G-Eval and LLM as Judge.

    G-Eval is a method of evaluating the quality of a model's output. It is a weighted average of the scores of the tokens in the output. The weights are the log probabilities of the tokens in the output. https://arxiv.org/abs/2303.16634

    LLM as Judge is a method of evaluating the quality of a model's output. It simply asks the LLM to score, and uses the returned output (no logprobs needed). Also called direct evaluation.

    @misc{liu2023gevalnlgevaluationusing,
        title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
        author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
        year={2023},
        eprint={2303.16634},
        archivePrefix={arXiv},
        primaryClass={cs.CL},
        url={https://arxiv.org/abs/2303.16634},
    }
    """
 99
100    def __init__(self, eval_config: EvalConfig, run_config: RunConfigProperties | None):
101        if (
102            eval_config.config_type != EvalConfigType.g_eval
103            and eval_config.config_type != EvalConfigType.llm_as_judge
104        ):
105            raise ValueError(
106                f"GEval must be initialized with a GEval or LLM as Judge config_type. Got {eval_config.config_type}"
107            )
108
109        super().__init__(eval_config, run_config)
110
111        self.geval_task = GEvalTask(eval_config)
112
113    def generate_final_answer_run_description(
114        self, eval_input: str, eval_output: str
115    ) -> str:
116        return f"""The model was given the following input for the task: 
117<eval_data>
118{eval_input}
119</eval_data>
120
121The model produced the following output for the task:
122<eval_data>
123{eval_output}
124</eval_data>
125"""
126
127    def generate_ref_ans_run_description(
128        self, eval_input: str, eval_output: str, reference_answer: str
129    ) -> str:
130        return f"""The model was given the following input for the task: 
131<eval_data>
132{eval_input}
133</eval_data>
134
135The model produced the following output for the task:
136<eval_data>
137{eval_output}
138</eval_data>
139
140This is the reference answer:
141<eval_data>
142{reference_answer}
143</eval_data>
144"""
145
    def generate_full_trace_run_description(
        self,
        eval_input: str,
        available_tools: str | None,
        conversation_history: str,
    ) -> str:
        """
        Build the judge-prompt body for full-trace evaluation.

        Includes the user input, any tool-use guidance (from the eval's spec, or
        from legacy template properties when no spec exists), the available tool
        list, and the full conversation history — each wrapped in <eval_data> tags
        as promised by the judge's system instruction.

        Args:
            eval_input: The original task input given to the model.
            available_tools: Formatted tool list; None means tool info is unknown,
                empty string means the model had no tools.
            conversation_history: Pre-formatted conversation/trace text.

        Returns:
            The assembled prompt body as a single string.
        """
        description = ""
        description += f"""The model was given the following <user_input> for the <task_description>: 
<eval_data>
<user_input>{eval_input}</user_input>
</eval_data>
"""
        # Get properties from spec if available, otherwise from eval.template_properties (for legacy evals)
        spec = self.eval.associated_spec(readonly=True)

        # Spec uses different keys than legacy eval template_properties
        if spec:
            # Spec: tool_use_guidelines, appropriate_tool_use_examples, inappropriate_tool_use_examples
            # `or ""` collapses missing/None properties to empty strings
            tool_use_guidelines = str(spec.properties.get("tool_use_guidelines") or "")
            appropriate_tool_use_examples = str(
                spec.properties.get("appropriate_tool_use_examples") or ""
            )
            inappropriate_tool_use_examples = str(
                spec.properties.get("inappropriate_tool_use_examples") or ""
            )
            description += f"""The model was given the following <tool_use_guidelines>:
<eval_data>
<tool_use_guidelines>
{tool_use_guidelines}
</tool_use_guidelines>
</eval_data>
"""
            description += f"""The model was given the following <appropriate_tool_use_examples>:
<eval_data>
<appropriate_tool_use_examples>
{appropriate_tool_use_examples}
</appropriate_tool_use_examples>
</eval_data>
"""
            description += f"""The model was given the following <inappropriate_tool_use_examples>:
<eval_data>
<inappropriate_tool_use_examples>
{inappropriate_tool_use_examples}
</inappropriate_tool_use_examples>
</eval_data>
"""
        elif self.eval.template_properties:
            # Legacy eval: appropriate_tool_use_guidelines, inappropriate_tool_use_guidelines
            appropriate_tool_use_guidelines = str(
                self.eval.template_properties.get("appropriate_tool_use_guidelines")
                or ""
            )
            inappropriate_tool_use_guidelines = str(
                self.eval.template_properties.get("inappropriate_tool_use_guidelines")
                or ""
            )

            description += f"""The model was given the following <appropriate_tool_use_guidelines> guidelines: 
<eval_data>
<appropriate_tool_use_guidelines>
{appropriate_tool_use_guidelines}
</appropriate_tool_use_guidelines>
</eval_data>
"""
            # Only include if it has content since it is optional
            if inappropriate_tool_use_guidelines:
                description += f"""The model was given the following <inappropriate_tool_use_guidelines> guidelines: 
<eval_data>
<inappropriate_tool_use_guidelines>
{inappropriate_tool_use_guidelines}
</inappropriate_tool_use_guidelines>
</eval_data>
"""

        # None = unknown (omit entirely); "" = explicitly no tools
        if available_tools is not None:
            if available_tools != "":
                description += f"""
This is the list of tools available to the model:
<eval_data>
<available_tools>{available_tools}</available_tools>
</eval_data>
"""
            else:
                description += """
There were no tools available to the model.
"""

        description += f"""
This is the full conversation history for the task run:
<eval_data>
<conversation_history>{conversation_history}</conversation_history>
</eval_data>
"""
        return description
240
    async def run_eval(
        self, task_run: TaskRun, eval_job_item: TaskRun | None = None
    ) -> tuple[EvalScores, Dict[str, str] | None]:
        """
        Run this eval on the given task run.

        Builds a judge prompt matching the eval's data type (full trace,
        reference answer, or final answer), invokes the judge model through a
        Kiln adapter, and converts the judge's structured output into scores —
        via logprob weighting for G-Eval, or directly for LLM as Judge.

        Args:
            task_run: The task run to evaluate.
            eval_job_item: The dataset item; required only for reference-answer
                evaluation, where its output is used as the reference answer.

        Returns:
            A tuple of (scores keyed by metric name, the judge's intermediate
            outputs such as chain-of-thought, if any).

        Raises:
            ValueError: if the data required by the eval's data type is missing
                (trace for full-trace, eval_job_item for reference-answer).
        """

        model_name, provider = self.model_and_provider()

        # Only fetch logprobs for G-Eval
        # There are at most 5 valid rating tokens per rating type (five_star being largest), so 10 is more than enough to get to the very very unlikely
        top_logprobs = (
            10 if self.eval_config.config_type == EvalConfigType.g_eval else None
        )

        # We don't expose setting this manually in the UI, so pull a recommended mode from ml_model_list
        structured_output_mode = default_structured_output_mode_for_model_provider(
            model_name,
            provider,
            default=StructuredOutputMode.json_schema,
            # G-eval expects JSON, so don't allow function calling modes
            disallowed_modes=[
                StructuredOutputMode.function_calling,
                StructuredOutputMode.function_calling_weak,
            ],
        )

        adapter = adapter_for_task(
            self.geval_task,
            run_config_properties=KilnAgentRunConfigProperties(
                model_name=model_name,
                model_provider_name=provider,
                # We always use Simple COT for G-Eval and LLM as Judge
                prompt_id=PromptGenerators.SIMPLE_CHAIN_OF_THOUGHT,
                structured_output_mode=structured_output_mode,
            ),
            base_adapter_config=AdapterConfig(
                # Don't save this run into the task_runs. It will be saved into an eval_run where it belongs
                allow_saving=False,
                top_logprobs=top_logprobs,
            ),
        )

        if self.eval.evaluation_data_type == EvalDataType.full_trace:
            if task_run.trace is None:
                raise ValueError("Task run trace is required for full trace evaluation")

            available_tools = await EvalUtils.formatted_available_tools_from_task_run(
                task_run
            )
            run_description = self.generate_full_trace_run_description(
                task_run.input,
                available_tools,
                EvalTraceFormatter.trace_to_formatted_conversation_history(
                    task_run.trace
                ),
            )

        elif self.eval.evaluation_data_type == EvalDataType.reference_answer:
            if eval_job_item is None:
                raise ValueError(
                    "Eval job item is required for reference answer evaluation"
                )
            run_description = self.generate_ref_ans_run_description(
                task_run.input, task_run.output.output, eval_job_item.output.output
            )

        else:  # EvalDataType.final_answer
            run_description = self.generate_final_answer_run_description(
                task_run.input, task_run.output.output
            )

        # We don't need the run, but invoke_returning_run_output() runs validations for us over _run()
        _, run_output = await adapter.invoke_returning_run_output(run_description)

        if self.eval_config.config_type == EvalConfigType.llm_as_judge:
            return self.build_llm_as_judge_score(
                run_output
            ), run_output.intermediate_outputs
        else:
            return self.build_g_eval_score(run_output), run_output.intermediate_outputs
322
323    def build_llm_as_judge_score(self, run_output: RunOutput) -> EvalScores:
324        """
325        Build the LLM as Judge score for the given run and run output.
326        """
327        # Convert the output format we asked for (discreet values) to our float scores
328        scores: EvalScores = {}
329        if not isinstance(run_output.output, dict):
330            raise ValueError("LLM as Judge output must be a dictionary")
331
332        for metric, score in run_output.output.items():
333            token_score = self.score_from_token_string(f"{score}")
334            if token_score is None:
335                raise ValueError(
336                    f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema."
337                )
338            scores[metric] = token_score
339        return scores
340
341    def build_g_eval_score(self, run_output: RunOutput) -> EvalScores:
342        """
343        Build the G-Eval score for the given run and run output.
344
345        We create a weighted average of each rating using the logprobs.
346
347        @misc{liu2023gevalnlgevaluationusing,
348            title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
349            author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
350            year={2023},
351            eprint={2303.16634},
352            archivePrefix={arXiv},
353            primaryClass={cs.CL},
354            url={https://arxiv.org/abs/2303.16634},
355        }
356        """
357        # We use structured output
358        outputs = run_output.output
359        assert isinstance(outputs, dict)
360
361        # Build raw string output from the logprobs, which is easier to work with than Dict for the next bit
362        raw_output = self.raw_output_from_logprobs(run_output)
363
364        # find the offset the start of each metric in the raw output json
365        metrics: List[str] = list(outputs.keys())
366        metric_offsets = self.metric_offsets(raw_output, metrics)
367
368        final_scores: EvalScores = {}
369        for metric in metrics:
370            score = self.g_eval_single_metric(
371                run_output, metric, metric_offsets, raw_output
372            )
373            if score is None:
374                raise ValueError(
375                    f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema."
376                )
377            final_scores[metric] = score
378
379        return final_scores
380
381    def g_eval_single_metric(
382        self,
383        run_output: RunOutput,
384        metric: str,
385        metric_offsets: Dict[str, int],
386        raw_output: str,
387    ) -> float | None:
388        """
389        Run the G-Eval for a single metric.
390
391        Scan the logprobs for the metric and return the weighted score of the rating token.
392        """
393
394        start_offset, end_offset = self.token_search_range(
395            raw_output, metric, metric_offsets
396        )
397
398        offset = 0
399
400        if (
401            run_output.output_logprobs is None
402            or run_output.output_logprobs.content is None
403        ):
404            raise RuntimeError(
405                "No logprobs found for output - can not calculate g-eval"
406            )
407
408        # scan the tokens in the range, looking for the rating token
409        for _, chat_logprob in enumerate(run_output.output_logprobs.content):
410            if offset >= end_offset:
411                break
412            if offset >= start_offset:
413                score = self.rating_token_to_score(chat_logprob)
414                if score is not None:
415                    return score
416            offset += len(chat_logprob.token)
417
418        return None
419
420    def raw_output_from_logprobs(self, run_output: RunOutput) -> str:
421        """
422        Build the raw output string from the logprobs. Generate from logprobs so it's guaranteed to match the logprobs offsets
423        """
424        if (
425            run_output.output_logprobs is None
426            or run_output.output_logprobs.content is None
427        ):
428            raise RuntimeError(
429                "No logprobs found for output - can not calculate g-eval"
430            )
431
432        raw = ""
433        for chat_logprob in run_output.output_logprobs.content:
434            raw += chat_logprob.token
435        return raw
436
437    def token_search_range(
438        self, raw_output: str, metric: str, metric_offsets: Dict[str, int]
439    ) -> Tuple[int, int]:
440        """
441        Find the start and end offsets of the metric in the raw output.
442
443        Start searching after the end of the target metric json entry ("overall_rating":), and before the start of the next metric ("some_other_score").
444        """
445        start_offset = metric_offsets[metric] + len(metric)
446
447        # Find the lowest end offset that is greater than the start offset
448        end_offset = len(raw_output)
449        for v in list(metric_offsets.values()):
450            if v < end_offset and v > start_offset:
451                end_offset = v
452
453        return start_offset, end_offset
454
    def rating_token_to_score(
        self, token_logprob: ChatCompletionTokenLogprob
    ) -> float | None:
        """
        Convert a rating token to a score using weighted average of top logprobs.

        Only includes tokens that have valid scores; non-rating alternatives
        (punctuation, whitespace) are excluded from the weighting so they don't
        dilute the rating tokens' probabilities.

        Some cleanup for upper case, whitespace and quotes. LLMs aren't always consistent.

        Returns None when the primary token itself is not a rating token.
        Raises RuntimeError if no probability mass was accumulated (should be
        impossible given the primary token has a valid score).
        """
        primary_token_score = self.score_from_token_string(token_logprob.token)
        # check this is a real rating token, it could just be the ": ", "," or whitespace
        if primary_token_score is None:
            return None

        total_score = 0.0
        total_probability = 0.0
        top_logprobs_contains_primary_token = False

        # Process all valid scoring tokens from alternatives
        for top_logprob in token_logprob.top_logprobs:
            if top_logprob.token == token_logprob.token:
                top_logprobs_contains_primary_token = True
            token_score = self.score_from_token_string(top_logprob.token)
            if token_score is not None:
                # Convert logprob to probability
                probability = math.exp(top_logprob.logprob)
                total_score += token_score * probability
                total_probability += probability

        # Weird OpenAI 4o bug - sometimes the primary token is included in the top logprobs, sometimes not.
        # Add the primary token back in if excluded
        if not top_logprobs_contains_primary_token:
            if token_logprob.logprob == -9999.0:
                # Another "bug" - sometimes the logprob is -9999.0. This seems to happen when the rest of the logprobs are tiny probability.
                # Treat the primary token as probability 1.0 in that case.
                total_score += primary_token_score * 1.0
                total_probability += 1.0
            else:
                probability = math.exp(token_logprob.logprob)
                total_score += primary_token_score * probability
                total_probability += probability

        if total_probability <= 0.0:
            raise RuntimeError(
                f"No valid scoring tokens found for {token_logprob.token}. This should never happen as the token has a valid score (so it must be excluded from top logprobs). Please file a bug if you see this."
            )

        # Normalize by total probability of valid tokens (LLM may have wanted to generate other non-rating tokens, these shouldn't lower score of rating tokens)
        weighted_score = total_score / total_probability

        return weighted_score
506
507    def score_from_token_string(self, token: str) -> float | None:
508        if token in TOKEN_TO_SCORE_MAP:
509            return TOKEN_TO_SCORE_MAP[token]
510
511        # handle more token variations like '"1"' and '"pass"' and ' paSS' and 'PASS'
512        unquoted_token = token.strip().strip('"').lower()
513        if unquoted_token in TOKEN_TO_SCORE_MAP:
514            return TOKEN_TO_SCORE_MAP[unquoted_token]
515
516        # handle numeric tokens like "1.0"
517        try:
518            float_value = float(token)
519            if float_value.is_integer():
520                str_token = str(int(float_value))
521                if str_token in TOKEN_TO_SCORE_MAP:
522                    return TOKEN_TO_SCORE_MAP[str_token]
523        except ValueError:
524            pass
525
526        return None
527
528    def metric_offsets(self, raw_output: str, metrics: List[str]) -> Dict[str, int]:
529        """
530        Find the offset to the start of each metric in the raw output json
531
532        For the example json: `{"overall_rating": 1}` == 1
533
534        should return:
535        {
536            "overall_rating": 1 # it's 1 character into the json string
537        }
538        """
539        metric_offsets: Dict[str, int] = {}
540        for metric in metrics:
541            # the quoted metric name is expected in the json: `{"overall_rating": 1}` == 1
542            metric_name = f'"{metric}"'
543
544            # we expect it exactly once
545            count = raw_output.count(metric_name)
546            if count != 1:
547                raise ValueError(
548                    f"Metric {metric} should appear exactly once in the output. Found {count} times"
549                )
550
551            offset = raw_output.find(metric_name)
552            if offset == -1:
553                raise ValueError(f"Metric {metric} not found in raw output")
554            metric_offsets[metric] = offset
555        return metric_offsets

An evaluator which implements G-Eval and LLM as Judge.

G-Eval is a method of evaluating the quality of a model's output. It is a weighted average of the scores of the tokens in the output. The weights are the log probabilities of the tokens in the output. https://arxiv.org/abs/2303.16634

LLM as Judge is a method of evaluating the quality of a model's output. It simply asks the LLM to score, and uses the returned output (no logprobs needed). Also called direct evaluation.

@misc{liu2023gevalnlgevaluationusing, title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment}, author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu}, year={2023}, eprint={2303.16634}, archivePrefix={arXiv}, primaryClass={cs.CL}, url={https://arxiv.org/abs/2303.16634}, }

GEval( eval_config: kiln_ai.datamodel.eval.EvalConfig, run_config: Optional[Annotated[Union[Annotated[kiln_ai.datamodel.run_config.KilnAgentRunConfigProperties, Tag(tag='kiln_agent')], Annotated[kiln_ai.datamodel.run_config.McpRunConfigProperties, Tag(tag='mcp')]], Discriminator(discriminator=<function _get_run_config_type>, custom_error_type=None, custom_error_message=None, custom_error_context=None)]])
100    def __init__(self, eval_config: EvalConfig, run_config: RunConfigProperties | None):
101        if (
102            eval_config.config_type != EvalConfigType.g_eval
103            and eval_config.config_type != EvalConfigType.llm_as_judge
104        ):
105            raise ValueError(
106                f"GEval must be initialized with a GEval or LLM as Judge config_type. Got {eval_config.config_type}"
107            )
108
109        super().__init__(eval_config, run_config)
110
111        self.geval_task = GEvalTask(eval_config)
geval_task
def generate_final_answer_run_description(self, eval_input: str, eval_output: str) -> str:
113    def generate_final_answer_run_description(
114        self, eval_input: str, eval_output: str
115    ) -> str:
116        return f"""The model was given the following input for the task: 
117<eval_data>
118{eval_input}
119</eval_data>
120
121The model produced the following output for the task:
122<eval_data>
123{eval_output}
124</eval_data>
125"""
def generate_ref_ans_run_description(self, eval_input: str, eval_output: str, reference_answer: str) -> str:
127    def generate_ref_ans_run_description(
128        self, eval_input: str, eval_output: str, reference_answer: str
129    ) -> str:
130        return f"""The model was given the following input for the task: 
131<eval_data>
132{eval_input}
133</eval_data>
134
135The model produced the following output for the task:
136<eval_data>
137{eval_output}
138</eval_data>
139
140This is the reference answer:
141<eval_data>
142{reference_answer}
143</eval_data>
144"""
def generate_full_trace_run_description( self, eval_input: str, available_tools: str | None, conversation_history: str) -> str:
146    def generate_full_trace_run_description(
147        self,
148        eval_input: str,
149        available_tools: str | None,
150        conversation_history: str,
151    ) -> str:
152        description = ""
153        description += f"""The model was given the following <user_input> for the <task_description>: 
154<eval_data>
155<user_input>{eval_input}</user_input>
156</eval_data>
157"""
158        # Get properties from spec if available, otherwise from eval.template_properties (for legacy evals)
159        spec = self.eval.associated_spec(readonly=True)
160
161        # Spec uses different keys than legacy eval template_properties
162        if spec:
163            # Spec: tool_use_guidelines, appropriate_tool_use_examples, inappropriate_tool_use_examples
164            tool_use_guidelines = str(spec.properties.get("tool_use_guidelines") or "")
165            appropriate_tool_use_examples = str(
166                spec.properties.get("appropriate_tool_use_examples") or ""
167            )
168            inappropriate_tool_use_examples = str(
169                spec.properties.get("inappropriate_tool_use_examples") or ""
170            )
171            description += f"""The model was given the following <tool_use_guidelines>:
172<eval_data>
173<tool_use_guidelines>
174{tool_use_guidelines}
175</tool_use_guidelines>
176</eval_data>
177"""
178            description += f"""The model was given the following <appropriate_tool_use_examples>:
179<eval_data>
180<appropriate_tool_use_examples>
181{appropriate_tool_use_examples}
182</appropriate_tool_use_examples>
183</eval_data>
184"""
185            description += f"""The model was given the following <inappropriate_tool_use_examples>:
186<eval_data>
187<inappropriate_tool_use_examples>
188{inappropriate_tool_use_examples}
189</inappropriate_tool_use_examples>
190</eval_data>
191"""
192        elif self.eval.template_properties:
193            # Legacy eval: appropriate_tool_use_guidelines, inappropriate_tool_use_guidelines
194            appropriate_tool_use_guidelines = str(
195                self.eval.template_properties.get("appropriate_tool_use_guidelines")
196                or ""
197            )
198            inappropriate_tool_use_guidelines = str(
199                self.eval.template_properties.get("inappropriate_tool_use_guidelines")
200                or ""
201            )
202
203            description += f"""The model was given the following <appropriate_tool_use_guidelines> guidelines: 
204<eval_data>
205<appropriate_tool_use_guidelines>
206{appropriate_tool_use_guidelines}
207</appropriate_tool_use_guidelines>
208</eval_data>
209"""
210            # Only include if it has content since it is optional
211            if inappropriate_tool_use_guidelines:
212                description += f"""The model was given the following <inappropriate_tool_use_guidelines> guidelines: 
213<eval_data>
214<inappropriate_tool_use_guidelines>
215{inappropriate_tool_use_guidelines}
216</inappropriate_tool_use_guidelines>
217</eval_data>
218"""
219
220        if available_tools is not None:
221            if available_tools != "":
222                description += f"""
223This is the list of tools available to the model:
224<eval_data>
225<available_tools>{available_tools}</available_tools>
226</eval_data>
227"""
228            else:
229                description += """
230There were no tools available to the model.
231"""
232
233        description += f"""
234This is the full conversation history for the task run:
235<eval_data>
236<conversation_history>{conversation_history}</conversation_history>
237</eval_data>
238"""
239        return description
async def run_eval( self, task_run: kiln_ai.datamodel.TaskRun, eval_job_item: kiln_ai.datamodel.TaskRun | None = None) -> tuple[typing.Dict[str, float], typing.Optional[typing.Dict[str, str]]]:
241    async def run_eval(
242        self, task_run: TaskRun, eval_job_item: TaskRun | None = None
243    ) -> tuple[EvalScores, Dict[str, str] | None]:
244        """
245        Run this eval on the given task run.
246        """
247
248        model_name, provider = self.model_and_provider()
249
250        # Only fetch logprobs for G-Eval
251        # There are at most 5 valid rating tokens per rating type (five_star being largest), so 10 is more than enough to get to the very very unlikely
252        top_logprobs = (
253            10 if self.eval_config.config_type == EvalConfigType.g_eval else None
254        )
255
256        # We don't expose setting this manually in the UI, so pull a recommended mode from ml_model_list
257        structured_output_mode = default_structured_output_mode_for_model_provider(
258            model_name,
259            provider,
260            default=StructuredOutputMode.json_schema,
261            # G-eval expects JSON, so don't allow function calling modes
262            disallowed_modes=[
263                StructuredOutputMode.function_calling,
264                StructuredOutputMode.function_calling_weak,
265            ],
266        )
267
268        adapter = adapter_for_task(
269            self.geval_task,
270            run_config_properties=KilnAgentRunConfigProperties(
271                model_name=model_name,
272                model_provider_name=provider,
273                # We always use Simple COT for G-Eval and LLM as Judge
274                prompt_id=PromptGenerators.SIMPLE_CHAIN_OF_THOUGHT,
275                structured_output_mode=structured_output_mode,
276            ),
277            base_adapter_config=AdapterConfig(
278                # Don't save this run into the task_runs. It will be saved into an eval_run where it belongs
279                allow_saving=False,
280                top_logprobs=top_logprobs,
281            ),
282        )
283
284        if self.eval.evaluation_data_type == EvalDataType.full_trace:
285            if task_run.trace is None:
286                raise ValueError("Task run trace is required for full trace evaluation")
287
288            available_tools = await EvalUtils.formatted_available_tools_from_task_run(
289                task_run
290            )
291            run_description = self.generate_full_trace_run_description(
292                task_run.input,
293                available_tools,
294                EvalTraceFormatter.trace_to_formatted_conversation_history(
295                    task_run.trace
296                ),
297            )
298
299        elif self.eval.evaluation_data_type == EvalDataType.reference_answer:
300            if eval_job_item is None:
301                raise ValueError(
302                    "Eval job item is required for reference answer evaluation"
303                )
304            run_description = self.generate_ref_ans_run_description(
305                task_run.input, task_run.output.output, eval_job_item.output.output
306            )
307
308        else:  # EvalDataType.final_answer
309            run_description = self.generate_final_answer_run_description(
310                task_run.input, task_run.output.output
311            )
312
313        # We don't need the run, but invoke_returning_run_output() runs validations for us over _run()
314        _, run_output = await adapter.invoke_returning_run_output(run_description)
315
316        if self.eval_config.config_type == EvalConfigType.llm_as_judge:
317            return self.build_llm_as_judge_score(
318                run_output
319            ), run_output.intermediate_outputs
320        else:
321            return self.build_g_eval_score(run_output), run_output.intermediate_outputs

Run this eval on the given task run.

def build_llm_as_judge_score( self, run_output: kiln_ai.adapters.run_output.RunOutput) -> Dict[str, float]:
323    def build_llm_as_judge_score(self, run_output: RunOutput) -> EvalScores:
324        """
325        Build the LLM as Judge score for the given run and run output.
326        """
327        # Convert the output format we asked for (discrete values) to our float scores
328        scores: EvalScores = {}
329        if not isinstance(run_output.output, dict):
330            raise ValueError("LLM as Judge output must be a dictionary")
331
332        for metric, score in run_output.output.items():
333            token_score = self.score_from_token_string(f"{score}")
334            if token_score is None:
335                raise ValueError(
336                    f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema."
337                )
338            scores[metric] = token_score
339        return scores

Build the LLM as Judge score for the given run and run output.

def build_g_eval_score( self, run_output: kiln_ai.adapters.run_output.RunOutput) -> Dict[str, float]:
341    def build_g_eval_score(self, run_output: RunOutput) -> EvalScores:
342        """
343        Build the G-Eval score for the given run and run output.
344
345        We create a weighted average of each rating using the logprobs.
346
347        @misc{liu2023gevalnlgevaluationusing,
348            title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
349            author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
350            year={2023},
351            eprint={2303.16634},
352            archivePrefix={arXiv},
353            primaryClass={cs.CL},
354            url={https://arxiv.org/abs/2303.16634},
355        }
356        """
357        # We use structured output
358        outputs = run_output.output
359        assert isinstance(outputs, dict)
360
361        # Build raw string output from the logprobs, which is easier to work with than Dict for the next bit
362        raw_output = self.raw_output_from_logprobs(run_output)
363
364        # find the offset the start of each metric in the raw output json
365        metrics: List[str] = list(outputs.keys())
366        metric_offsets = self.metric_offsets(raw_output, metrics)
367
368        final_scores: EvalScores = {}
369        for metric in metrics:
370            score = self.g_eval_single_metric(
371                run_output, metric, metric_offsets, raw_output
372            )
373            if score is None:
374                raise ValueError(
375                    f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema."
376                )
377            final_scores[metric] = score
378
379        return final_scores

Build the G-Eval score for the given run and run output.

We create a weighted average of each rating using the logprobs.

@misc{liu2023gevalnlgevaluationusing, title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment}, author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu}, year={2023}, eprint={2303.16634}, archivePrefix={arXiv}, primaryClass={cs.CL}, url={https://arxiv.org/abs/2303.16634}, }

def g_eval_single_metric( self, run_output: kiln_ai.adapters.run_output.RunOutput, metric: str, metric_offsets: Dict[str, int], raw_output: str) -> float | None:
381    def g_eval_single_metric(
382        self,
383        run_output: RunOutput,
384        metric: str,
385        metric_offsets: Dict[str, int],
386        raw_output: str,
387    ) -> float | None:
388        """
389        Run the G-Eval for a single metric.
390
391        Scan the logprobs for the metric and return the weighted score of the rating token.
392        """
393
394        start_offset, end_offset = self.token_search_range(
395            raw_output, metric, metric_offsets
396        )
397
398        offset = 0
399
400        if (
401            run_output.output_logprobs is None
402            or run_output.output_logprobs.content is None
403        ):
404            raise RuntimeError(
405                "No logprobs found for output - can not calculate g-eval"
406            )
407
408        # scan the tokens in the range, looking for the rating token
409        for _, chat_logprob in enumerate(run_output.output_logprobs.content):
410            if offset >= end_offset:
411                break
412            if offset >= start_offset:
413                score = self.rating_token_to_score(chat_logprob)
414                if score is not None:
415                    return score
416            offset += len(chat_logprob.token)
417
418        return None

Run the G-Eval for a single metric.

Scan the logprobs for the metric and return the weighted score of the rating token.

def raw_output_from_logprobs(self, run_output: kiln_ai.adapters.run_output.RunOutput) -> str:
420    def raw_output_from_logprobs(self, run_output: RunOutput) -> str:
421        """
422        Build the raw output string from the logprobs. Generate from logprobs so it's guaranteed to match the logprobs offsets
423        """
424        if (
425            run_output.output_logprobs is None
426            or run_output.output_logprobs.content is None
427        ):
428            raise RuntimeError(
429                "No logprobs found for output - can not calculate g-eval"
430            )
431
432        raw = ""
433        for chat_logprob in run_output.output_logprobs.content:
434            raw += chat_logprob.token
435        return raw

Build the raw output string from the logprobs. Generate from logprobs so it's guaranteed to match the logprobs offsets

def token_search_range( self, raw_output: str, metric: str, metric_offsets: Dict[str, int]) -> Tuple[int, int]:
437    def token_search_range(
438        self, raw_output: str, metric: str, metric_offsets: Dict[str, int]
439    ) -> Tuple[int, int]:
440        """
441        Find the start and end offsets of the metric in the raw output.
442
443        Start searching after the end of the target metric json entry ("overall_rating":), and before the start of the next metric ("some_other_score").
444        """
445        start_offset = metric_offsets[metric] + len(metric)
446
447        # Find the lowest end offset that is greater than the start offset
448        end_offset = len(raw_output)
449        for v in list(metric_offsets.values()):
450            if v < end_offset and v > start_offset:
451                end_offset = v
452
453        return start_offset, end_offset

Find the start and end offsets of the metric in the raw output.

Start searching after the end of the target metric json entry ("overall_rating":), and before the start of the next metric ("some_other_score").

def rating_token_to_score( self, token_logprob: litellm.types.utils.ChatCompletionTokenLogprob) -> float | None:
455    def rating_token_to_score(
456        self, token_logprob: ChatCompletionTokenLogprob
457    ) -> float | None:
458        """
459        Convert a rating token to a score using weighted average of top logprobs.
460
461        Only includes tokens that have valid scores.
462
463        Some cleanup for upper case, whitespace and quotes. LLMs aren't always consistent.
464        """
465        primary_token_score = self.score_from_token_string(token_logprob.token)
466        # check this is a real rating token, it could just be the ": ", "," or whitespace
467        if primary_token_score is None:
468            return None
469
470        total_score = 0.0
471        total_probability = 0.0
472        top_logprobs_contains_primary_token = False
473
474        # Process all valid scoring tokens from alternatives
475        for top_logprob in token_logprob.top_logprobs:
476            if top_logprob.token == token_logprob.token:
477                top_logprobs_contains_primary_token = True
478            token_score = self.score_from_token_string(top_logprob.token)
479            if token_score is not None:
480                # Convert logprob to probability
481                probability = math.exp(top_logprob.logprob)
482                total_score += token_score * probability
483                total_probability += probability
484
485        # Weird OpenAI 4o bug - sometimes the primary token is included in the top logprobs, sometimes not.
486        # Add the primary token back in if excluded
487        if not top_logprobs_contains_primary_token:
488            if token_logprob.logprob == -9999.0:
489                # Another "bug" - sometimes the logprob is -9999.0. This seems to happen when the rest of the logprobs are tiny probability.
490                total_score += primary_token_score * 1.0
491                total_probability += 1.0
492            else:
493                probability = math.exp(token_logprob.logprob)
494                total_score += primary_token_score * probability
495                total_probability += probability
496
497        if total_probability <= 0.0:
498            raise RuntimeError(
499                f"No valid scoring tokens found for {token_logprob.token}. This should never happen as the token has a valid score (so it must be excluded from top logprobs). Please file a bug if you see this."
500            )
501
502        # Normalize by total probability of valid tokens (LLM may have wanted to generate other non-rating tokens, these shouldn't lower score of rating tokens)
503        weighted_score = total_score / total_probability
504
505        return weighted_score

Convert a rating token to a score using weighted average of top logprobs.

Only includes tokens that have valid scores.

Some cleanup for upper case, whitespace and quotes. LLMs aren't always consistent.

def score_from_token_string(self, token: str) -> float | None:
507    def score_from_token_string(self, token: str) -> float | None:
508        if token in TOKEN_TO_SCORE_MAP:
509            return TOKEN_TO_SCORE_MAP[token]
510
511        # handle more token variations like '"1"' and '"pass"' and ' paSS' and 'PASS'
512        unquoted_token = token.strip().strip('"').lower()
513        if unquoted_token in TOKEN_TO_SCORE_MAP:
514            return TOKEN_TO_SCORE_MAP[unquoted_token]
515
516        # handle numeric tokens like "1.0"
517        try:
518            float_value = float(token)
519            if float_value.is_integer():
520                str_token = str(int(float_value))
521                if str_token in TOKEN_TO_SCORE_MAP:
522                    return TOKEN_TO_SCORE_MAP[str_token]
523        except ValueError:
524            pass
525
526        return None
def metric_offsets(self, raw_output: str, metrics: List[str]) -> Dict[str, int]:
528    def metric_offsets(self, raw_output: str, metrics: List[str]) -> Dict[str, int]:
529        """
530        Find the offset to the start of each metric in the raw output json
531
532        For the example json: `{"overall_rating": 1}` == 1
533
534        should return:
535        {
536            "overall_rating": 1 # it's 1 character into the json string
537        }
538        """
539        metric_offsets: Dict[str, int] = {}
540        for metric in metrics:
541            # the quoted metric name is expected in the json: `{"overall_rating": 1}` == 1
542            metric_name = f'"{metric}"'
543
544            # we expect it exactly once
545            count = raw_output.count(metric_name)
546            if count != 1:
547                raise ValueError(
548                    f"Metric {metric} should appear exactly once in the output. Found {count} times"
549                )
550
551            offset = raw_output.find(metric_name)
552            if offset == -1:
553                raise ValueError(f"Metric {metric} not found in raw output")
554            metric_offsets[metric] = offset
555        return metric_offsets

Find the offset to the start of each metric in the raw output json

For the example json `{"overall_rating": 1}`, it should return `{"overall_rating": 1}` — the key "overall_rating" starts one character into the json string.