kiln_ai.adapters.eval.g_eval

  1import math
  2from typing import Dict, List, Tuple
  3
  4from litellm.types.utils import ChatCompletionTokenLogprob
  5
  6from kiln_ai.adapters.adapter_registry import adapter_for_task
  7from kiln_ai.adapters.eval.base_eval import BaseEval
  8from kiln_ai.adapters.eval.eval_utils.eval_trace_formatter import EvalTraceFormatter
  9from kiln_ai.adapters.eval.eval_utils.eval_utils import EvalUtils
 10from kiln_ai.adapters.ml_model_list import (
 11    default_structured_output_mode_for_model_provider,
 12)
 13from kiln_ai.adapters.model_adapters.base_adapter import (
 14    AdapterConfig,
 15    RunOutput,
 16    SkillsDict,
 17)
 18from kiln_ai.adapters.prompt_builders import PromptGenerators
 19from kiln_ai.datamodel import Project, Task, TaskRun
 20from kiln_ai.datamodel.eval import EvalConfig, EvalConfigType, EvalDataType, EvalScores
 21from kiln_ai.datamodel.run_config import KilnAgentRunConfigProperties
 22from kiln_ai.datamodel.task import RunConfigProperties, StructuredOutputMode
 23
 24# all the tokens we score for, and their float scores.
 25TOKEN_TO_SCORE_MAP: Dict[str, float] = {
 26    "1": 1.0,
 27    "2": 2.0,
 28    "3": 3.0,
 29    "4": 4.0,
 30    "5": 5.0,
 31    "pass": 1.0,
 32    "fail": 0.0,
 33    "critical": -1.0,
 34}
 35
 36
 37class GEvalTask(Task, parent_of={}):
 38    """
 39    Kiln task for executing a G-Eval. Can be run on any Kiln adapter which supports logprobs.
 40
 41    Note: this task implements both G-Eval and LLM as Judge, as they are very similar.
 42    """
 43
 44    def __init__(self, eval_config: EvalConfig):
 45        tmp_project = Project(name="GEval")
 46
 47        # Build a simple LLM as Judge system instruction
 48        system_instruction = "Your job is to evaluate a model's performance on a task. Blocks will be marked with <eval_data> tags.\n"
 49        # Optionally add a short task description
 50        task_description = eval_config.properties.get("task_description", None)
 51        if task_description:
 52            system_instruction += f"\nThe task the model was given is as follows:\n<eval_data>\n<task_description>{task_description}</task_description>\n</eval_data>\n"
 53
 54        # Build the COT eval instructions
 55        steps = eval_config.properties.get("eval_steps", [])
 56        if not isinstance(steps, list):
 57            raise ValueError("eval_steps must be a list.")
 58        if len(steps) == 1:
 59            cot_instructions = "First, think step by step about the model's performance following this evaluation step:\n\n"
 60            cot_instructions += f"{steps[0]}\n"
 61        else:
 62            cot_instructions = "First, think step by step about the model's performance following these evaluation steps:\n\n"
 63            for i, step in enumerate(steps):
 64                cot_instructions += f"{i + 1}) {step}\n"
 65
 66        eval = eval_config.parent_eval()
 67        if not eval:
 68            raise ValueError("Eval config must have a parent eval")
 69
 70        # Build the output schema from the eval's target output scores.
 71        # We restrict the LLM's output scoring schema to discrete scores (pass/fail/critical/1-5) - allow_float_scores=False
 72        # However, the final scores from the evaluator can be a float (see later logprob calculation, which requires discrete token outputs)
 73        output_schema = BaseEval.build_score_schema(eval, allow_float_scores=False)
 74
 75        super().__init__(
 76            name="GEval Task",
 77            parent=tmp_project,
 78            instruction=system_instruction,
 79            thinking_instruction=cot_instructions,
 80            output_json_schema=output_schema,
 81        )
 82
 83
 84class GEval(BaseEval):
 85    """
 86    An evaluator which implements G-Eval and LLM as Judge.
 87
 88    G-Eval is a method of evaluating the quality of a model's output. The final score is a weighted average of the candidate rating tokens' scores, weighted by the probabilities of those tokens (derived from their log probabilities). https://arxiv.org/abs/2303.16634
 89
 90    LLM as Judge is a method of evaluating the quality of a model's output. It simply asks the LLM to score, and uses the returned output (no logprobs needed). Also called direct evaluation.
 91
 92    @misc{liu2023gevalnlgevaluationusing,
 93        title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
 94        author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
 95        year={2023},
 96        eprint={2303.16634},
 97        archivePrefix={arXiv},
 98        primaryClass={cs.CL},
 99        url={https://arxiv.org/abs/2303.16634},
100    }
101    """
102
103    def __init__(
104        self,
105        eval_config: EvalConfig,
106        run_config: RunConfigProperties | None,
107        skills: SkillsDict | None = None,
108    ):
109        if (
110            eval_config.config_type != EvalConfigType.g_eval
111            and eval_config.config_type != EvalConfigType.llm_as_judge
112        ):
113            raise ValueError(
114                f"GEval must be initialized with a GEval or LLM as Judge config_type. Got {eval_config.config_type}"
115            )
116
117        super().__init__(eval_config, run_config, skills=skills)
118
119        self.geval_task = GEvalTask(eval_config)
120
121    def generate_final_answer_run_description(
122        self, eval_input: str, eval_output: str
123    ) -> str:
124        return f"""The model was given the following input for the task: 
125<eval_data>
126{eval_input}
127</eval_data>
128
129The model produced the following output for the task:
130<eval_data>
131{eval_output}
132</eval_data>
133"""
134
135    def generate_ref_ans_run_description(
136        self, eval_input: str, eval_output: str, reference_answer: str
137    ) -> str:
138        return f"""The model was given the following input for the task: 
139<eval_data>
140{eval_input}
141</eval_data>
142
143The model produced the following output for the task:
144<eval_data>
145{eval_output}
146</eval_data>
147
148This is the reference answer:
149<eval_data>
150{reference_answer}
151</eval_data>
152"""
153
154    def generate_full_trace_run_description(
155        self,
156        eval_input: str,
157        available_tools: str | None,
158        conversation_history: str,
159    ) -> str:
160        description = ""
161        description += f"""The model was given the following <user_input> for the <task_description>: 
162<eval_data>
163<user_input>{eval_input}</user_input>
164</eval_data>
165"""
166        # Get properties from spec if available, otherwise from eval.template_properties (for legacy evals)
167        spec = self.eval.associated_spec(readonly=True)
168
169        # Spec uses different keys than legacy eval template_properties
170        if spec:
171            # Spec: tool_use_guidelines, appropriate_tool_use_examples, inappropriate_tool_use_examples
172            tool_use_guidelines = str(spec.properties.get("tool_use_guidelines") or "")
173            appropriate_tool_use_examples = str(
174                spec.properties.get("appropriate_tool_use_examples") or ""
175            )
176            inappropriate_tool_use_examples = str(
177                spec.properties.get("inappropriate_tool_use_examples") or ""
178            )
179            description += f"""The model was given the following <tool_use_guidelines>:
180<eval_data>
181<tool_use_guidelines>
182{tool_use_guidelines}
183</tool_use_guidelines>
184</eval_data>
185"""
186            description += f"""The model was given the following <appropriate_tool_use_examples>:
187<eval_data>
188<appropriate_tool_use_examples>
189{appropriate_tool_use_examples}
190</appropriate_tool_use_examples>
191</eval_data>
192"""
193            description += f"""The model was given the following <inappropriate_tool_use_examples>:
194<eval_data>
195<inappropriate_tool_use_examples>
196{inappropriate_tool_use_examples}
197</inappropriate_tool_use_examples>
198</eval_data>
199"""
200        elif self.eval.template_properties:
201            # Legacy eval: appropriate_tool_use_guidelines, inappropriate_tool_use_guidelines
202            appropriate_tool_use_guidelines = str(
203                self.eval.template_properties.get("appropriate_tool_use_guidelines")
204                or ""
205            )
206            inappropriate_tool_use_guidelines = str(
207                self.eval.template_properties.get("inappropriate_tool_use_guidelines")
208                or ""
209            )
210
211            description += f"""The model was given the following <appropriate_tool_use_guidelines> guidelines: 
212<eval_data>
213<appropriate_tool_use_guidelines>
214{appropriate_tool_use_guidelines}
215</appropriate_tool_use_guidelines>
216</eval_data>
217"""
218            # Only include if it has content since it is optional
219            if inappropriate_tool_use_guidelines:
220                description += f"""The model was given the following <inappropriate_tool_use_guidelines> guidelines: 
221<eval_data>
222<inappropriate_tool_use_guidelines>
223{inappropriate_tool_use_guidelines}
224</inappropriate_tool_use_guidelines>
225</eval_data>
226"""
227
228        if available_tools is not None:
229            if available_tools != "":
230                description += f"""
231This is the list of tools available to the model:
232<eval_data>
233<available_tools>{available_tools}</available_tools>
234</eval_data>
235"""
236            else:
237                description += """
238There were no tools available to the model.
239"""
240
241        description += f"""
242This is the full conversation history for the task run:
243<eval_data>
244<conversation_history>{conversation_history}</conversation_history>
245</eval_data>
246"""
247        return description
248
249    async def run_eval(
250        self, task_run: TaskRun, eval_job_item: TaskRun | None = None
251    ) -> tuple[EvalScores, Dict[str, str] | None]:
252        """
253        Run this eval on the given task run.
254        """
255
256        model_name, provider = self.model_and_provider()
257
258        # Only fetch logprobs for G-Eval
259        # There are at most 5 valid rating tokens per rating type (five_star being the largest), so 10 is more than enough to capture even very unlikely candidates
260        top_logprobs = (
261            10 if self.eval_config.config_type == EvalConfigType.g_eval else None
262        )
263
264        # We don't expose setting this manually in the UI, so pull a recommended mode from ml_model_list
265        structured_output_mode = default_structured_output_mode_for_model_provider(
266            model_name,
267            provider,
268            default=StructuredOutputMode.json_schema,
269            # G-eval expects JSON, so don't allow function calling modes
270            disallowed_modes=[
271                StructuredOutputMode.function_calling,
272                StructuredOutputMode.function_calling_weak,
273            ],
274        )
275
276        adapter = adapter_for_task(
277            self.geval_task,
278            run_config_properties=KilnAgentRunConfigProperties(
279                model_name=model_name,
280                model_provider_name=provider,
281                # We always use Simple COT for G-Eval and LLM as Judge
282                prompt_id=PromptGenerators.SIMPLE_CHAIN_OF_THOUGHT,
283                structured_output_mode=structured_output_mode,
284            ),
285            base_adapter_config=AdapterConfig(
286                # Don't save this run into the task_runs. It will be saved into an eval_run where it belongs
287                allow_saving=False,
288                top_logprobs=top_logprobs,
289            ),
290        )
291
292        if self.eval.evaluation_data_type == EvalDataType.full_trace:
293            if task_run.trace is None:
294                raise ValueError("Task run trace is required for full trace evaluation")
295
296            available_tools = await EvalUtils.formatted_available_tools_from_task_run(
297                task_run
298            )
299            run_description = self.generate_full_trace_run_description(
300                task_run.input,
301                available_tools,
302                EvalTraceFormatter.trace_to_formatted_conversation_history(
303                    task_run.trace
304                ),
305            )
306
307        elif self.eval.evaluation_data_type == EvalDataType.reference_answer:
308            if eval_job_item is None:
309                raise ValueError(
310                    "Eval job item is required for reference answer evaluation"
311                )
312            run_description = self.generate_ref_ans_run_description(
313                task_run.input, task_run.output.output, eval_job_item.output.output
314            )
315
316        else:  # EvalDataType.final_answer
317            run_description = self.generate_final_answer_run_description(
318                task_run.input, task_run.output.output
319            )
320
321        # We don't need the run itself, but invoke_returning_run_output() runs validations for us (unlike calling _run() directly)
322        _, run_output = await adapter.invoke_returning_run_output(run_description)
323
324        if self.eval_config.config_type == EvalConfigType.llm_as_judge:
325            return self.build_llm_as_judge_score(
326                run_output
327            ), run_output.intermediate_outputs
328        else:
329            return self.build_g_eval_score(run_output), run_output.intermediate_outputs
330
331    def build_llm_as_judge_score(self, run_output: RunOutput) -> EvalScores:
332        """
333        Build the LLM as Judge score for the given run and run output.
334        """
335        # Convert the output format we asked for (discrete values) to our float scores
336        scores: EvalScores = {}
337        if not isinstance(run_output.output, dict):
338            raise ValueError("LLM as Judge output must be a dictionary")
339
340        for metric, score in run_output.output.items():
341            token_score = self.score_from_token_string(f"{score}")
342            if token_score is None:
343                raise ValueError(
344                    f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema."
345                )
346            scores[metric] = token_score
347        return scores
348
349    def build_g_eval_score(self, run_output: RunOutput) -> EvalScores:
350        """
351        Build the G-Eval score for the given run and run output.
352
353        We create a weighted average of each rating using the logprobs.
354
355        @misc{liu2023gevalnlgevaluationusing,
356            title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
357            author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
358            year={2023},
359            eprint={2303.16634},
360            archivePrefix={arXiv},
361            primaryClass={cs.CL},
362            url={https://arxiv.org/abs/2303.16634},
363        }
364        """
365        # We use structured output
366        outputs = run_output.output
367        assert isinstance(outputs, dict)
368
369        # Build raw string output from the logprobs, which is easier to work with than Dict for the next bit
370        raw_output = self.raw_output_from_logprobs(run_output)
371
372        # find the offset of the start of each metric in the raw output json
373        metrics: List[str] = list(outputs.keys())
374        metric_offsets = self.metric_offsets(raw_output, metrics)
375
376        final_scores: EvalScores = {}
377        for metric in metrics:
378            score = self.g_eval_single_metric(
379                run_output, metric, metric_offsets, raw_output
380            )
381            if score is None:
382                raise ValueError(
383                    f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema."
384                )
385            final_scores[metric] = score
386
387        return final_scores
388
389    def g_eval_single_metric(
390        self,
391        run_output: RunOutput,
392        metric: str,
393        metric_offsets: Dict[str, int],
394        raw_output: str,
395    ) -> float | None:
396        """
397        Run the G-Eval for a single metric.
398
399        Scan the logprobs for the metric and return the weighted score of the rating token.
400        """
401
402        start_offset, end_offset = self.token_search_range(
403            raw_output, metric, metric_offsets
404        )
405
406        offset = 0
407
408        if (
409            run_output.output_logprobs is None
410            or run_output.output_logprobs.content is None
411        ):
412            raise RuntimeError(
413                "No logprobs found for output - can not calculate g-eval"
414            )
415
416        # scan the tokens in the range, looking for the rating token
417        for _, chat_logprob in enumerate(run_output.output_logprobs.content):
418            if offset >= end_offset:
419                break
420            if offset >= start_offset:
421                score = self.rating_token_to_score(chat_logprob)
422                if score is not None:
423                    return score
424            offset += len(chat_logprob.token)
425
426        return None
427
428    def raw_output_from_logprobs(self, run_output: RunOutput) -> str:
429        """
430        Build the raw output string from the logprobs. Generate from logprobs so it's guaranteed to match the logprobs offsets
431        """
432        if (
433            run_output.output_logprobs is None
434            or run_output.output_logprobs.content is None
435        ):
436            raise RuntimeError(
437                "No logprobs found for output - can not calculate g-eval"
438            )
439
440        raw = ""
441        for chat_logprob in run_output.output_logprobs.content:
442            raw += chat_logprob.token
443        return raw
444
445    def token_search_range(
446        self, raw_output: str, metric: str, metric_offsets: Dict[str, int]
447    ) -> Tuple[int, int]:
448        """
449        Find the start and end offsets of the metric in the raw output.
450
451        Start searching after the end of the target metric json entry ("overall_rating":), and before the start of the next metric ("some_other_score").
452        """
453        start_offset = metric_offsets[metric] + len(metric)
454
455        # Find the lowest end offset that is greater than the start offset
456        end_offset = len(raw_output)
457        for v in list(metric_offsets.values()):
458            if v < end_offset and v > start_offset:
459                end_offset = v
460
461        return start_offset, end_offset
462
463    def rating_token_to_score(
464        self, token_logprob: ChatCompletionTokenLogprob
465    ) -> float | None:
466        """
467        Convert a rating token to a score using weighted average of top logprobs.
468
469        Only includes tokens that have valid scores.
470
471        Some cleanup for upper case, whitespace and quotes. LLMs aren't always consistent.
472        """
473        primary_token_score = self.score_from_token_string(token_logprob.token)
474        # check this is a real rating token, it could just be the ": ", "," or whitespace
475        if primary_token_score is None:
476            return None
477
478        total_score = 0.0
479        total_probability = 0.0
480        top_logprobs_contains_primary_token = False
481
482        # Process all valid scoring tokens from alternatives
483        for top_logprob in token_logprob.top_logprobs:
484            if top_logprob.token == token_logprob.token:
485                top_logprobs_contains_primary_token = True
486            token_score = self.score_from_token_string(top_logprob.token)
487            if token_score is not None:
488                # Convert logprob to probability
489                probability = math.exp(top_logprob.logprob)
490                total_score += token_score * probability
491                total_probability += probability
492
493        # Weird OpenAI 4o bug - sometimes the primary token is included in the top logprobs, sometimes not.
494        # Add the primary token back in if excluded
495        if not top_logprobs_contains_primary_token:
496            if token_logprob.logprob == -9999.0:
497                # Another "bug" - sometimes the logprob is -9999.0. This seems to happen when the rest of the logprobs are tiny probability.
498                total_score += primary_token_score * 1.0
499                total_probability += 1.0
500            else:
501                probability = math.exp(token_logprob.logprob)
502                total_score += primary_token_score * probability
503                total_probability += probability
504
505        if total_probability <= 0.0:
506            raise RuntimeError(
507                f"No valid scoring tokens found for {token_logprob.token}. This should never happen as the token has a valid score (so it must be excluded from top logprobs). Please file a bug if you see this."
508            )
509
510        # Normalize by total probability of valid tokens (LLM may have wanted to generate other non-rating tokens, these shouldn't lower score of rating tokens)
511        weighted_score = total_score / total_probability
512
513        return weighted_score
514
515    def score_from_token_string(self, token: str) -> float | None:
516        if token in TOKEN_TO_SCORE_MAP:
517            return TOKEN_TO_SCORE_MAP[token]
518
519        # handle more token variations like '"1"' and '"pass"' and ' paSS' and 'PASS'
520        unquoted_token = token.strip().strip('"').lower()
521        if unquoted_token in TOKEN_TO_SCORE_MAP:
522            return TOKEN_TO_SCORE_MAP[unquoted_token]
523
524        # handle numeric tokens like "1.0"
525        try:
526            float_value = float(token)
527            if float_value.is_integer():
528                str_token = str(int(float_value))
529                if str_token in TOKEN_TO_SCORE_MAP:
530                    return TOKEN_TO_SCORE_MAP[str_token]
531        except ValueError:
532            pass
533
534        return None
535
536    def metric_offsets(self, raw_output: str, metrics: List[str]) -> Dict[str, int]:
537        """
538        Find the offset to the start of each metric in the raw output json
539
540        For the example json: `{"overall_rating": 1}`
541
542        should return:
543        {
544            "overall_rating": 1 # it's 1 character into the json string
545        }
546        """
547        metric_offsets: Dict[str, int] = {}
548        for metric in metrics:
549            # the quoted metric name is expected in the json, e.g. `"overall_rating"` in `{"overall_rating": 1}`
550            metric_name = f'"{metric}"'
551
552            # we expect it exactly once
553            count = raw_output.count(metric_name)
554            if count != 1:
555                raise ValueError(
556                    f"Metric {metric} should appear exactly once in the output. Found {count} times"
557                )
558
559            offset = raw_output.find(metric_name)
560            if offset == -1:
561                raise ValueError(f"Metric {metric} not found in raw output")
562            metric_offsets[metric] = offset
563        return metric_offsets
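
The offset bookkeeping above (`metric_offsets`, `token_search_range`, and `g_eval_single_metric`) is easier to follow with a concrete example. The sketch below is illustrative only: it re-creates the offset search on a plain string with a hypothetical two-metric output, rather than calling the methods themselves.

# Illustrative only: how the offset search above behaves on a hypothetical raw output.
# The raw JSON reconstructed from the logprob tokens might look like this:
raw_output = '{"overall_rating": 4, "style_rating": 2}'
metrics = ["overall_rating", "style_rating"]

# metric_offsets: offset of each quoted metric name in the raw JSON string.
offsets = {m: raw_output.find(f'"{m}"') for m in metrics}
print(offsets)  # {'overall_rating': 1, 'style_rating': 22}

# token_search_range for "overall_rating": start just past the key name,
# end at the next metric's offset (or the end of the string for the last metric).
start = offsets["overall_rating"] + len("overall_rating")  # 15
end = offsets["style_rating"]  # 22

# g_eval_single_metric then walks the logprob tokens whose character offsets fall in
# [start, end) and returns the first one that maps to a score, here the token "4"
# at offset 19.
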
TOKEN_TO_SCORE_MAP: Dict[str, float] = {'1': 1.0, '2': 2.0, '3': 3.0, '4': 4.0, '5': 5.0, 'pass': 1.0, 'fail': 0.0, 'critical': -1.0}
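
`GEval.score_from_token_string` (shown in the module source above) normalizes rating tokens before looking them up in this map, so quoting, whitespace, case, and integer-valued floats all resolve to the same canonical entries. A purely illustrative mapping of inputs to results:

# Illustrative only: inputs to GEval.score_from_token_string and the scores they resolve to.
examples = {
    "4": 4.0,          # direct hit in TOKEN_TO_SCORE_MAP
    '"pass"': 1.0,     # quotes are stripped
    " FAIL ": 0.0,     # whitespace stripped, lowercased
    "critical": -1.0,  # direct hit
    "5.0": 5.0,        # integer-valued numeric strings fold back onto "5"
    ":": None,         # punctuation/whitespace tokens have no score and are skipped
}
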
class GEvalTask(kiln_ai.datamodel.task.Task):

Kiln task for executing a G-Eval. Can be run on any Kiln adapter which supports logprobs.

Note: this task implements both G-Eval and LLM as Judge, as they are very similar.
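
As a rough illustration of how `GEvalTask.__init__` (see the module source above) assembles the judge's chain-of-thought instructions, the snippet below reproduces the instruction-building logic on its own. The `steps` values are hypothetical; real values come from `eval_config.properties["eval_steps"]`.

# Illustrative only: mirrors how GEvalTask.__init__ builds thinking_instruction.
# These steps are made up; real steps come from eval_config.properties["eval_steps"].
steps = [
    "Check the output answers the user's question.",
    "Check the output follows the requested format.",
]

if len(steps) == 1:
    cot_instructions = (
        "First, think step by step about the model's performance "
        "following this evaluation step:\n\n" + f"{steps[0]}\n"
    )
else:
    cot_instructions = (
        "First, think step by step about the model's performance "
        "following these evaluation steps:\n\n"
    )
    for i, step in enumerate(steps):
        cot_instructions += f"{i + 1}) {step}\n"

print(cot_instructions)
# ...following these evaluation steps:
#
# 1) Check the output answers the user's question.
# 2) Check the output follows the requested format.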

GEvalTask(eval_config: kiln_ai.datamodel.eval.EvalConfig)


class GEval(kiln_ai.adapters.eval.base_eval.BaseEval):

An evaluator which implements G-Eval and LLM as Judge.

G-Eval is a method of evaluating the quality of a model's output. The final score is a weighted average of the candidate rating tokens' scores, weighted by the probabilities of those tokens (derived from their log probabilities). https://arxiv.org/abs/2303.16634

LLM as Judge is a method of evaluating the quality of a model's output. It simply asks the LLM to score, and uses the returned output (no logprobs needed). Also called direct evaluation.

@misc{liu2023gevalnlgevaluationusing,
    title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
    author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
    year={2023},
    eprint={2303.16634},
    archivePrefix={arXiv},
    primaryClass={cs.CL},
    url={https://arxiv.org/abs/2303.16634},
}
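
To make the weighted-average scoring concrete, here is a small worked example in the spirit of `rating_token_to_score`. The logprob values are invented, and the real method additionally handles the primary-token and -9999.0 edge cases visible in the module source.

import math

# Illustrative only: the expected score for one rating-token position,
# with invented top-logprob alternatives for the emitted token "4".
top_logprobs = {
    "4": math.log(0.70),  # p = 0.70
    "5": math.log(0.20),  # p = 0.20
    "3": math.log(0.05),  # p = 0.05
    ",": math.log(0.05),  # not a rating token, ignored
}
token_to_score = {"3": 3.0, "4": 4.0, "5": 5.0}

total_score = 0.0
total_probability = 0.0
for token, logprob in top_logprobs.items():
    score = token_to_score.get(token)
    if score is None:
        continue  # non-rating tokens must not dilute the weighted average
    p = math.exp(logprob)
    total_score += score * p
    total_probability += p

# (4*0.70 + 5*0.20 + 3*0.05) / 0.95 ≈ 4.158
print(total_score / total_probability)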

GEval( eval_config: kiln_ai.datamodel.eval.EvalConfig, run_config: Optional[Annotated[Union[Annotated[kiln_ai.datamodel.run_config.KilnAgentRunConfigProperties, Tag(tag='kiln_agent')], Annotated[kiln_ai.datamodel.run_config.McpRunConfigProperties, Tag(tag='mcp')]], Discriminator(discriminator=<function _get_run_config_type>, custom_error_type=None, custom_error_message=None, custom_error_context=None)]], skills: Optional[Dict[str, kiln_ai.datamodel.Skill]] = None)
geval_task
def generate_final_answer_run_description(self, eval_input: str, eval_output: str) -> str:
def generate_ref_ans_run_description(self, eval_input: str, eval_output: str, reference_answer: str) -> str:
def generate_full_trace_run_description( self, eval_input: str, available_tools: str | None, conversation_history: str) -> str:
async def run_eval( self, task_run: kiln_ai.datamodel.TaskRun, eval_job_item: kiln_ai.datamodel.TaskRun | None = None) -> tuple[typing.Dict[str, float], typing.Optional[typing.Dict[str, str]]]:
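
A minimal usage sketch, assuming you already have a persisted `EvalConfig` (with a parent `Eval`), run config properties for the judge model, and a `TaskRun` to score; constructing those objects is not shown here.

# Minimal sketch, assuming eval_config, run_config_properties and task_run already
# exist (e.g. loaded from a Kiln project). Not a complete program on its own.
import asyncio

from kiln_ai.adapters.eval.g_eval import GEval

async def score_task_run(eval_config, run_config_properties, task_run):
    evaluator = GEval(eval_config, run_config_properties)
    # run_eval returns (scores, intermediate_outputs); scores maps metric name -> float.
    scores, intermediate_outputs = await evaluator.run_eval(task_run)
    return scores

# scores = asyncio.run(score_task_run(eval_config, run_config_properties, task_run))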
250    async def run_eval(
251        self, task_run: TaskRun, eval_job_item: TaskRun | None = None
252    ) -> tuple[EvalScores, Dict[str, str] | None]:
253        """
254        Run this eval on the given task run.
255        """
256
257        model_name, provider = self.model_and_provider()
258
259        # Only fetch logprobs for G-Eval
260        # There are at most 5 valid rating tokens per rating type (five_star being the largest), so 10 is more than enough to capture even the very unlikely rating tokens
261        top_logprobs = (
262            10 if self.eval_config.config_type == EvalConfigType.g_eval else None
263        )
264
265        # We don't expose setting this manually in the UI, so pull a recommended mode from ml_model_list
266        structured_output_mode = default_structured_output_mode_for_model_provider(
267            model_name,
268            provider,
269            default=StructuredOutputMode.json_schema,
270            # G-eval expects JSON, so don't allow function calling modes
271            disallowed_modes=[
272                StructuredOutputMode.function_calling,
273                StructuredOutputMode.function_calling_weak,
274            ],
275        )
276
277        adapter = adapter_for_task(
278            self.geval_task,
279            run_config_properties=KilnAgentRunConfigProperties(
280                model_name=model_name,
281                model_provider_name=provider,
282                # We always use Simple COT for G-Eval and LLM as Judge
283                prompt_id=PromptGenerators.SIMPLE_CHAIN_OF_THOUGHT,
284                structured_output_mode=structured_output_mode,
285            ),
286            base_adapter_config=AdapterConfig(
287                # Don't save this run into the task_runs. It will be saved into an eval_run where it belongs
288                allow_saving=False,
289                top_logprobs=top_logprobs,
290            ),
291        )
292
293        if self.eval.evaluation_data_type == EvalDataType.full_trace:
294            if task_run.trace is None:
295                raise ValueError("Task run trace is required for full trace evaluation")
296
297            available_tools = await EvalUtils.formatted_available_tools_from_task_run(
298                task_run
299            )
300            run_description = self.generate_full_trace_run_description(
301                task_run.input,
302                available_tools,
303                EvalTraceFormatter.trace_to_formatted_conversation_history(
304                    task_run.trace
305                ),
306            )
307
308        elif self.eval.evaluation_data_type == EvalDataType.reference_answer:
309            if eval_job_item is None:
310                raise ValueError(
311                    "Eval job item is required for reference answer evaluation"
312                )
313            run_description = self.generate_ref_ans_run_description(
314                task_run.input, task_run.output.output, eval_job_item.output.output
315            )
316
317        else:  # EvalDataType.final_answer
318            run_description = self.generate_final_answer_run_description(
319                task_run.input, task_run.output.output
320            )
321
322        # We don't need the run, but invoke_returning_run_output() runs validations for us over _run()
323        _, run_output = await adapter.invoke_returning_run_output(run_description)
324
325        if self.eval_config.config_type == EvalConfigType.llm_as_judge:
326            return self.build_llm_as_judge_score(
327                run_output
328            ), run_output.intermediate_outputs
329        else:
330            return self.build_g_eval_score(run_output), run_output.intermediate_outputs

Run this eval on the given task run.
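For orientation, a minimal usage sketch is below. It assumes `evaluator` is an already-constructed evaluator from this module, and `my_task_run` / `reference_item` are TaskRun objects loaded elsewhere; those names are placeholders, not part of this API.

    import asyncio

    async def score_run(evaluator, my_task_run, reference_item=None):
        # run_eval returns (scores, intermediate_outputs); intermediate_outputs may be
        # None, or may hold e.g. the judge's chain-of-thought text.
        scores, intermediate_outputs = await evaluator.run_eval(
            my_task_run, eval_job_item=reference_item
        )
        # scores maps each metric name to a float, e.g. {"overall_rating": 4.2}
        return scores

    # asyncio.run(score_run(evaluator, my_task_run)) once the placeholder objects exist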

def build_llm_as_judge_score( self, run_output: kiln_ai.adapters.run_output.RunOutput) -> Dict[str, float]:
332    def build_llm_as_judge_score(self, run_output: RunOutput) -> EvalScores:
333        """
334        Build the LLM as Judge score for the given run and run output.
335        """
336        # Convert the output format we asked for (discrete values) to our float scores
337        scores: EvalScores = {}
338        if not isinstance(run_output.output, dict):
339            raise ValueError("LLM as Judge output must be a dictionary")
340
341        for metric, score in run_output.output.items():
342            token_score = self.score_from_token_string(f"{score}")
343            if token_score is None:
344                raise ValueError(
345                    f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema."
346                )
347            scores[metric] = token_score
348        return scores

Build the LLM as Judge score for the given run and run output.
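The conversion itself is essentially a lookup into the module-level TOKEN_TO_SCORE_MAP. A simplified sketch with a hypothetical judge output (the real method routes through score_from_token_string, which also normalizes quoting and case):

    from kiln_ai.adapters.eval.g_eval import TOKEN_TO_SCORE_MAP

    judge_output = {"overall_rating": 4, "tone": "pass"}  # hypothetical judge output
    scores = {
        metric: TOKEN_TO_SCORE_MAP[str(value)] for metric, value in judge_output.items()
    }
    assert scores == {"overall_rating": 4.0, "tone": 1.0}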

def build_g_eval_score( self, run_output: kiln_ai.adapters.run_output.RunOutput) -> Dict[str, float]:
350    def build_g_eval_score(self, run_output: RunOutput) -> EvalScores:
351        """
352        Build the G-Eval score for the given run and run output.
353
354        We create a weighted average of each rating using the logprobs.
355
356        @misc{liu2023gevalnlgevaluationusing,
357            title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
358            author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
359            year={2023},
360            eprint={2303.16634},
361            archivePrefix={arXiv},
362            primaryClass={cs.CL},
363            url={https://arxiv.org/abs/2303.16634},
364        }
365        """
366        # We use structured output
367        outputs = run_output.output
368        assert isinstance(outputs, dict)
369
370        # Build raw string output from the logprobs, which is easier to work with than Dict for the next bit
371        raw_output = self.raw_output_from_logprobs(run_output)
372
373        # find the offset the start of each metric in the raw output json
374        metrics: List[str] = list(outputs.keys())
375        metric_offsets = self.metric_offsets(raw_output, metrics)
376
377        final_scores: EvalScores = {}
378        for metric in metrics:
379            score = self.g_eval_single_metric(
380                run_output, metric, metric_offsets, raw_output
381            )
382            if score is None:
383                raise ValueError(
384                    f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema."
385                )
386            final_scores[metric] = score
387
388        return final_scores

Build the G-Eval score for the given run and run output.

We create a weighted average of each rating using the logprobs.

@misc{liu2023gevalnlgevaluationusing,
    title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
    author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
    year={2023},
    eprint={2303.16634},
    archivePrefix={arXiv},
    primaryClass={cs.CL},
    url={https://arxiv.org/abs/2303.16634},
}
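The core of the weighting is converting each candidate rating token's logprob into a probability and blending the corresponding scores. A sketch with illustrative numbers (not from a real run):

    import math

    # Hypothetical top logprobs observed at the rating position of one metric
    top_logprobs = {"4": -0.105, "5": -2.30, "3": -4.61}
    probs = {token: math.exp(lp) for token, lp in top_logprobs.items()}
    blended = sum(float(t) * p for t, p in probs.items()) / sum(probs.values())
    print(round(blended, 2))  # 4.09 - mostly a "4", nudged upward by the "5" alternative

This is why G-Eval scores can be floats even though the judge is constrained to discrete rating tokens in its output schema.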

def g_eval_single_metric( self, run_output: kiln_ai.adapters.run_output.RunOutput, metric: str, metric_offsets: Dict[str, int], raw_output: str) -> float | None:
390    def g_eval_single_metric(
391        self,
392        run_output: RunOutput,
393        metric: str,
394        metric_offsets: Dict[str, int],
395        raw_output: str,
396    ) -> float | None:
397        """
398        Run the G-Eval for a single metric.
399
400        Scan the logprobs for the metric and return the weighted score of the rating token.
401        """
402
403        start_offset, end_offset = self.token_search_range(
404            raw_output, metric, metric_offsets
405        )
406
407        offset = 0
408
409        if (
410            run_output.output_logprobs is None
411            or run_output.output_logprobs.content is None
412        ):
413            raise RuntimeError(
414                "No logprobs found for output - can not calculate g-eval"
415            )
416
417        # scan the tokens in the range, looking for the rating token
418        for _, chat_logprob in enumerate(run_output.output_logprobs.content):
419            if offset >= end_offset:
420                break
421            if offset >= start_offset:
422                score = self.rating_token_to_score(chat_logprob)
423                if score is not None:
424                    return score
425            offset += len(chat_logprob.token)
426
427        return None

Run the G-Eval for a single metric.

Scan the logprobs for the metric and return the weighted score of the rating token.
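A simplified illustration of the scan follows; the token boundaries and offsets are made up, and the real code walks ChatCompletionTokenLogprob entries and uses rating_token_to_score rather than a plain string lookup.

    tokens = ['{"', 'overall', '_rating', '":', ' ', '4', '}']  # hypothetical tokenization
    start, end = 15, 21  # search after the metric name, before the next metric / end of output
    score_map = {"4": 4.0, "5": 5.0}  # relevant subset of the rating tokens

    offset = 0
    found = None
    for token in tokens:
        if offset >= end:
            break
        if offset >= start and token.strip().strip('"') in score_map:
            found = score_map[token.strip().strip('"')]
            break
        offset += len(token)
    print(found)  # 4.0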

def raw_output_from_logprobs(self, run_output: kiln_ai.adapters.run_output.RunOutput) -> str:
429    def raw_output_from_logprobs(self, run_output: RunOutput) -> str:
430        """
431        Build the raw output string from the logprobs. Generate from logprobs so it's guaranteed to match the logprobs offsets
432        """
433        if (
434            run_output.output_logprobs is None
435            or run_output.output_logprobs.content is None
436        ):
437            raise RuntimeError(
438                "No logprobs found for output - can not calculate g-eval"
439            )
440
441        raw = ""
442        for chat_logprob in run_output.output_logprobs.content:
443            raw += chat_logprob.token
444        return raw

Build the raw output string from the logprobs. Generate from logprobs so it's guaranteed to match the logprobs offsets

def token_search_range( self, raw_output: str, metric: str, metric_offsets: Dict[str, int]) -> Tuple[int, int]:
446    def token_search_range(
447        self, raw_output: str, metric: str, metric_offsets: Dict[str, int]
448    ) -> Tuple[int, int]:
449        """
450        Find the start and end offsets of the metric in the raw output.
451
452        Start searching after the end of the target metric json entry ("overall_rating":), and before the start of the next metric ("some_other_score").
453        """
454        start_offset = metric_offsets[metric] + len(metric)
455
456        # Find the lowest end offset that is greater than the start offset
457        end_offset = len(raw_output)
458        for v in list(metric_offsets.values()):
459            if v < end_offset and v > start_offset:
460                end_offset = v
461
462        return start_offset, end_offset

Find the start and end offsets of the metric in the raw output.

Start searching after the end of the target metric json entry ("overall_rating":), and before the start of the next metric ("some_other_score").
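A worked example with a hypothetical raw output containing two metrics; the search range for "overall_rating" starts after its name and ends where "conciseness" begins:

    raw_output = '{"overall_rating": 4, "conciseness": 5}'
    metric_offsets = {"overall_rating": 1, "conciseness": 22}

    metric = "overall_rating"
    start = metric_offsets[metric] + len(metric)  # 1 + 14 = 15
    later = [v for v in metric_offsets.values() if v > start]
    end = min(later) if later else len(raw_output)
    print(start, end)  # 15 22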

def rating_token_to_score( self, token_logprob: litellm.types.utils.ChatCompletionTokenLogprob) -> float | None:
464    def rating_token_to_score(
465        self, token_logprob: ChatCompletionTokenLogprob
466    ) -> float | None:
467        """
468        Convert a rating token to a score using weighted average of top logprobs.
469
470        Only includes tokens that have valid scores.
471
472        Some cleanup for upper case, whitespace and quotes. LLMs aren't always consistent.
473        """
474        primary_token_score = self.score_from_token_string(token_logprob.token)
475        # check this is a real rating token, it could just be the ": ", "," or whitespace
476        if primary_token_score is None:
477            return None
478
479        total_score = 0.0
480        total_probability = 0.0
481        top_logprobs_contains_primary_token = False
482
483        # Process all valid scoring tokens from alternatives
484        for top_logprob in token_logprob.top_logprobs:
485            if top_logprob.token == token_logprob.token:
486                top_logprobs_contains_primary_token = True
487            token_score = self.score_from_token_string(top_logprob.token)
488            if token_score is not None:
489                # Convert logprob to probability
490                probability = math.exp(top_logprob.logprob)
491                total_score += token_score * probability
492                total_probability += probability
493
494        # Weird OpenAI 4o bug - sometimes the primary token is included in the top logprobs, sometimes not.
495        # Add the primary token back in if excluded
496        if not top_logprobs_contains_primary_token:
497            if token_logprob.logprob == -9999.0:
498                # Another "bug" - sometimes the logprob is -9999.0. This seems to happen when the rest of the logprobs are tiny probability.
499                total_score += primary_token_score * 1.0
500                total_probability += 1.0
501            else:
502                probability = math.exp(token_logprob.logprob)
503                total_score += primary_token_score * probability
504                total_probability += probability
505
506        if total_probability <= 0.0:
507            raise RuntimeError(
508                f"No valid scoring tokens found for {token_logprob.token}. This should never happen as the token has a valid score (so it must be excluded from top logprobs). Please file a bug if you see this."
509            )
510
511        # Normalize by total probability of valid tokens (LLM may have wanted to generate other non-rating tokens, these shouldn't lower score of rating tokens)
512        weighted_score = total_score / total_probability
513
514        return weighted_score

Convert a rating token to a score using weighted average of top logprobs.

Only includes tokens that have valid scores.

Some cleanup for upper case, whitespace and quotes. LLMs aren't always consistent.
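Illustrative numbers for the key normalization step: alternatives that are not rating tokens (punctuation, whitespace) are dropped, and the remaining probability mass is renormalized over the rating tokens only, so non-rating alternatives do not drag the score down.

    import math

    # Hypothetical top logprobs at the rating position; "," and " " are not rating tokens
    top = {"4": -0.36, "5": -1.61, ",": -3.0, " ": -4.0}
    score_map = {"4": 4.0, "5": 5.0}  # relevant subset of TOKEN_TO_SCORE_MAP
    valid = {t: math.exp(lp) for t, lp in top.items() if t in score_map}
    score = sum(score_map[t] * p for t, p in valid.items()) / sum(valid.values())
    print(round(score, 2))  # 4.22 - only the "4" and "5" alternatives contribute

The method also adds the chosen token back into the blend when the provider omits it from top_logprobs, as noted in the source above.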

def score_from_token_string(self, token: str) -> float | None:
516    def score_from_token_string(self, token: str) -> float | None:
517        if token in TOKEN_TO_SCORE_MAP:
518            return TOKEN_TO_SCORE_MAP[token]
519
520        # handle more token variations like '"1"' and '"pass"' and ' paSS' and 'PASS'
521        unquoted_token = token.strip().strip('"').lower()
522        if unquoted_token in TOKEN_TO_SCORE_MAP:
523            return TOKEN_TO_SCORE_MAP[unquoted_token]
524
525        # handle numeric tokens like "1.0"
526        try:
527            float_value = float(token)
528            if float_value.is_integer():
529                str_token = str(int(float_value))
530                if str_token in TOKEN_TO_SCORE_MAP:
531                    return TOKEN_TO_SCORE_MAP[str_token]
532        except ValueError:
533            pass
534
535        return None
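A few illustrative lookups (here `evaluator` stands in for an existing evaluator instance; the inputs are hypothetical):

    evaluator.score_from_token_string("pass")     # 1.0 - direct map hit
    evaluator.score_from_token_string(' "FAIL"')  # 0.0 - whitespace, quotes, and case stripped
    evaluator.score_from_token_string("4.0")      # 4.0 - numeric string normalized to "4"
    evaluator.score_from_token_string(":")        # None - not a rating token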
def metric_offsets(self, raw_output: str, metrics: List[str]) -> Dict[str, int]:
537    def metric_offsets(self, raw_output: str, metrics: List[str]) -> Dict[str, int]:
538        """
539        Find the offset to the start of each metric in the raw output json
540
541        For the example json: `{"overall_rating": 1}` == 1
542
543        should return:
544        {
545            "overall_rating": 1 # it's 1 character into the json string
546        }
547        """
548        metric_offsets: Dict[str, int] = {}
549        for metric in metrics:
550            # the quoted metric name is expected in the json: `{"overall_rating": 1}` == 1
551            metric_name = f'"{metric}"'
552
553            # we expect it exactly once
554            count = raw_output.count(metric_name)
555            if count != 1:
556                raise ValueError(
557                    f"Metric {metric} should appear exactly once in the output. Found {count} times"
558                )
559
560            offset = raw_output.find(metric_name)
561            if offset == -1:
562                raise ValueError(f"Metric {metric} not found in raw output")
563            metric_offsets[metric] = offset
564        return metric_offsets

Find the offset to the start of each metric in the raw output json

For the example JSON {"overall_rating": 1}, it should return {"overall_rating": 1}, since the quoted metric name starts 1 character into the string.
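In code, with a hypothetical raw output containing two metrics:

    raw_output = '{"overall_rating": 4, "conciseness": 5}'
    offsets = {m: raw_output.find(f'"{m}"') for m in ["overall_rating", "conciseness"]}
    print(offsets)  # {'overall_rating': 1, 'conciseness': 22}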