kiln_ai.adapters.eval.g_eval

import math
from typing import Dict, List, Tuple

from litellm.types.utils import ChatCompletionTokenLogprob

from kiln_ai.adapters.adapter_registry import adapter_for_task
from kiln_ai.adapters.eval.base_eval import BaseEval
from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig, RunOutput
from kiln_ai.adapters.prompt_builders import PromptGenerators
from kiln_ai.datamodel import Project, Task, TaskRun
from kiln_ai.datamodel.eval import EvalConfig, EvalConfigType, EvalScores
from kiln_ai.datamodel.task import RunConfig

# all the tokens we score for, and their float scores.
TOKEN_TO_SCORE_MAP: Dict[str, float] = {
    "1": 1.0,
    "2": 2.0,
    "3": 3.0,
    "4": 4.0,
    "5": 5.0,
    "pass": 1.0,
    "fail": 0.0,
    "critical": -1.0,
}


class GEvalTask(Task, parent_of={}):
    """
    Kiln task for executing a G-Eval. Can be run on any Kiln adapter which supports logprobs.

    Note: GEval implements both G-Eval and LLM as Judge, as they are very similar.
    """

    def __init__(self, eval_config: EvalConfig):
        tmp_project = Project(name="GEval")

        # Build a simple LLM as Judge system instruction
        system_instruction = "Your job is to evaluate a model's performance on a task. Blocks will be marked with <eval_data> tags.\n"
        # Optionally add a short task description
        task_description = eval_config.properties.get("task_description", None)
        if task_description:
            system_instruction += f"\nThe task the model was given is as follows:\n<eval_data>\n{task_description}\n</eval_data>\n"

        # Build the COT eval instructions
        cot_instructions = "First, think step by step about the model's performance following these evaluation steps:\n\n"
        steps = eval_config.properties.get("eval_steps", None)
        if not steps or not isinstance(steps, list):
            raise ValueError("eval_steps must be a list")
        for i, step in enumerate(steps):
            cot_instructions += f"{i + 1}) {step}\n"

        eval = eval_config.parent_eval()
        if not eval:
            raise ValueError("Eval config must have a parent eval")

        # Build the output schema from the eval's target output scores.
        # We restrict the LLM's output scoring schema to discrete scores (pass/fail/critical/1-5) - allow_float_scores=False
        # However, the final scores from the evaluator can still be floats: the logprob calculation below requires discrete token outputs but produces fractional scores
        output_schema = BaseEval.build_score_schema(eval, allow_float_scores=False)

        super().__init__(
            name="GEval Task",
            parent=tmp_project,
            instruction=system_instruction,
            thinking_instruction=cot_instructions,
            output_json_schema=output_schema,
        )


class GEval(BaseEval):
    """
    An evaluator which implements both G-Eval and LLM as Judge.

    G-Eval is a method of evaluating the quality of a model's output. The score is a weighted average of the candidate rating tokens, weighted by the probabilities derived from their log probabilities. https://arxiv.org/abs/2303.16634

    LLM as Judge is a method of evaluating the quality of a model's output. It simply asks the LLM to score, and uses the returned output directly (no logprobs needed). Also called direct evaluation.

    @misc{liu2023gevalnlgevaluationusing,
        title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
        author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
        year={2023},
        eprint={2303.16634},
        archivePrefix={arXiv},
        primaryClass={cs.CL},
        url={https://arxiv.org/abs/2303.16634},
    }
    """

    def __init__(self, eval_config: EvalConfig, run_config: RunConfig | None):
        if (
            eval_config.config_type != EvalConfigType.g_eval
            and eval_config.config_type != EvalConfigType.llm_as_judge
        ):
            raise ValueError(
                f"GEval must be initialized with a GEval or LLM as Judge config_type. Got {eval_config.config_type}"
            )

        super().__init__(eval_config, run_config)

        self.geval_task = GEvalTask(eval_config)

    async def run_eval(
        self, task_run: TaskRun
    ) -> tuple[EvalScores, Dict[str, str] | None]:
        """
        Run this eval on the given task run.
        """

        model_name, provider = self.model_and_provider()

        # Only fetch logprobs for G-Eval
        # There are at most 5 valid rating tokens per rating type (five_star being the largest), so 10 top logprobs is more than enough to capture even very unlikely rating tokens
        top_logprobs = (
            10 if self.eval_config.config_type == EvalConfigType.g_eval else None
        )

        adapter = adapter_for_task(
            self.geval_task,
            model_name,
            provider,
            # We always use Simple COT for G-Eval and LLM as Judge
            prompt_id=PromptGenerators.SIMPLE_CHAIN_OF_THOUGHT,
            base_adapter_config=AdapterConfig(
                # Don't save this run into the task_runs. It will be saved into an eval_run where it belongs
                allow_saving=False,
                top_logprobs=top_logprobs,
            ),
        )

        input = f"""The model was given the following input for the task:
<eval_data>
{task_run.input}
</eval_data>

The model produced the following output for the task:
<eval_data>
{task_run.output}
</eval_data>
"""

        # We don't need the run itself, but invoke_returning_run_output() runs validations for us (unlike calling _run() directly)
        _, run_output = await adapter.invoke_returning_run_output(input)

        if self.eval_config.config_type == EvalConfigType.llm_as_judge:
            return self.build_llm_as_judge_score(
                run_output
            ), run_output.intermediate_outputs
        else:
            return self.build_g_eval_score(run_output), run_output.intermediate_outputs

    def build_llm_as_judge_score(self, run_output: RunOutput) -> EvalScores:
        """
        Build the LLM as Judge score for the given run and run output.
        """
        # Convert the output format we asked for (discrete values) to our float scores
        scores: EvalScores = {}
        if not isinstance(run_output.output, dict):
            raise ValueError("LLM as Judge output must be a dictionary")

        for metric, score in run_output.output.items():
            token_score = self.score_from_token_string(f"{score}")
            if token_score is None:
                raise ValueError(
                    f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema."
                )
            scores[metric] = token_score
        return scores

    def build_g_eval_score(self, run_output: RunOutput) -> EvalScores:
        """
        Build the G-Eval score for the given run and run output.

        We create a weighted average of each rating using the logprobs.

        @misc{liu2023gevalnlgevaluationusing,
            title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
            author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
            year={2023},
            eprint={2303.16634},
            archivePrefix={arXiv},
            primaryClass={cs.CL},
            url={https://arxiv.org/abs/2303.16634},
        }
        """
        # We use structured output
        outputs = run_output.output
        assert isinstance(outputs, dict)

        # Build raw string output from the logprobs, which is easier to work with than Dict for the next bit
        raw_output = self.raw_output_from_logprobs(run_output)

        # find the offset of the start of each metric in the raw output json
        metrics: List[str] = list(outputs.keys())
        metric_offsets = self.metric_offsets(raw_output, metrics)

        final_scores: EvalScores = {}
        for metric in metrics:
            score = self.g_eval_single_metric(
                run_output, metric, metric_offsets, raw_output
            )
            if score is None:
                raise ValueError(
                    f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema."
                )
            final_scores[metric] = score

        return final_scores

    def g_eval_single_metric(
        self,
        run_output: RunOutput,
        metric: str,
        metric_offsets: Dict[str, int],
        raw_output: str,
    ) -> float | None:
        """
        Run the G-Eval for a single metric.

        Scan the logprobs for the metric and return the weighted score of the rating token.
        """

        start_offset, end_offset = self.token_search_range(
            raw_output, metric, metric_offsets
        )

        offset = 0

        if (
            run_output.output_logprobs is None
            or run_output.output_logprobs.content is None
        ):
            raise RuntimeError(
                "No logprobs found for output - can not calculate g-eval"
            )

        # scan the tokens in the range, looking for the rating token
        for chat_logprob in run_output.output_logprobs.content:
            if offset >= end_offset:
                break
            if offset >= start_offset:
                score = self.rating_token_to_score(chat_logprob)
                if score is not None:
                    return score
            offset += len(chat_logprob.token)

        return None

    def raw_output_from_logprobs(self, run_output: RunOutput) -> str:
        """
        Build the raw output string from the logprobs. Generate from logprobs so it's guaranteed to match the logprobs offsets
        """
        if (
            run_output.output_logprobs is None
            or run_output.output_logprobs.content is None
        ):
            raise RuntimeError(
                "No logprobs found for output - can not calculate g-eval"
            )

        raw = ""
        for chat_logprob in run_output.output_logprobs.content:
            raw += chat_logprob.token
        return raw

    def token_search_range(
        self, raw_output: str, metric: str, metric_offsets: Dict[str, int]
    ) -> Tuple[int, int]:
        """
        Find the start and end offsets of the metric in the raw output.

        Start searching after the end of the target metric json entry ("overall_rating":), and before the start of the next metric ("some_other_score").
        """
        start_offset = metric_offsets[metric] + len(metric)

        # Find the lowest metric offset that is greater than the start offset; it becomes the end offset
        end_offset = len(raw_output)
        for v in metric_offsets.values():
            if v < end_offset and v > start_offset:
                end_offset = v

        return start_offset, end_offset

    def rating_token_to_score(
        self, token_logprob: ChatCompletionTokenLogprob
    ) -> float | None:
        """
        Convert a rating token to a score using weighted average of top logprobs.

        Only includes tokens that have valid scores.

        Some cleanup for upper case, whitespace and quotes. LLMs aren't always consistent.
        """
        primary_token_score = self.score_from_token_string(token_logprob.token)
        # check this is a real rating token, it could just be the ": ", "," or whitespace
        # (compare with None: 0.0 is a valid score for "fail" and must not be skipped)
        if primary_token_score is None:
            return None

        total_score = 0.0
        total_probability = 0.0
        top_logprobs_contains_primary_token = False

        # Process all valid scoring tokens from alternatives
        for top_logprob in token_logprob.top_logprobs:
            if top_logprob.token == token_logprob.token:
                top_logprobs_contains_primary_token = True
            token_score = self.score_from_token_string(top_logprob.token)
            if token_score is not None:
                # Convert logprob to probability
                probability = math.exp(top_logprob.logprob)
                total_score += token_score * probability
                total_probability += probability

        # Weird OpenAI 4o bug - sometimes the primary token is included in the top logprobs, sometimes not.
        # Add the primary token back in if excluded
        if not top_logprobs_contains_primary_token:
            if token_logprob.logprob == -9999.0:
                # Another "bug" - sometimes the logprob is -9999.0. This seems to happen when the rest of the logprobs are tiny probability.
                total_score += primary_token_score * 1.0
                total_probability += 1.0
            else:
                probability = math.exp(token_logprob.logprob)
                total_score += primary_token_score * probability
                total_probability += probability

        if total_probability <= 0.0:
            raise RuntimeError(
                f"No valid scoring tokens found for {token_logprob.token}. This should never happen as the token has a valid score (so it must be excluded from top logprobs). Please file a bug if you see this."
            )

        # Normalize by the total probability of valid tokens (the LLM may have wanted to generate other non-rating tokens; these shouldn't lower the score of rating tokens)
        weighted_score = total_score / total_probability

        return weighted_score

    def score_from_token_string(self, token: str) -> float | None:
        if token in TOKEN_TO_SCORE_MAP:
            return TOKEN_TO_SCORE_MAP[token]

        # handle more token variations like '"1"' and '"pass"' and ' paSS' and 'PASS'
        unquoted_token = token.strip().strip('"').lower()
        if unquoted_token in TOKEN_TO_SCORE_MAP:
            return TOKEN_TO_SCORE_MAP[unquoted_token]

        # handle numeric tokens like "1.0"
        try:
            float_value = float(token)
            if float_value.is_integer():
                str_token = str(int(float_value))
                if str_token in TOKEN_TO_SCORE_MAP:
                    return TOKEN_TO_SCORE_MAP[str_token]
        except ValueError:
            pass

        return None

    def metric_offsets(self, raw_output: str, metrics: List[str]) -> Dict[str, int]:
        """
        Find the offset of the start of each metric in the raw output json.

        For the example json `{"overall_rating": 1}` it should return:
        {
            "overall_rating": 1  # the quoted metric name starts 1 character into the json string
        }
        """
        metric_offsets: Dict[str, int] = {}
        for metric in metrics:
            # the quoted metric name is expected in the json, e.g. "overall_rating" in {"overall_rating": 1}
            metric_name = f'"{metric}"'

            # we expect it exactly once
            count = raw_output.count(metric_name)
            if count != 1:
                raise ValueError(
                    f"Metric {metric} should appear exactly once in the output. Found {count} times"
                )

            offset = raw_output.find(metric_name)
            if offset == -1:
                raise ValueError(f"Metric {metric} not found in raw output")
            metric_offsets[metric] = offset
        return metric_offsets
TOKEN_TO_SCORE_MAP: Dict[str, float] = {'1': 1.0, '2': 2.0, '3': 3.0, '4': 4.0, '5': 5.0, 'pass': 1.0, 'fail': 0.0, 'critical': -1.0}
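
A small standalone sketch of how rating tokens map to float scores, mirroring the normalization that score_from_token_string applies (quotes, whitespace, case, and integer-valued floats). The sample tokens below are illustrative only.

from typing import Dict, Optional

TOKEN_TO_SCORE_MAP: Dict[str, float] = {
    "1": 1.0, "2": 2.0, "3": 3.0, "4": 4.0, "5": 5.0,
    "pass": 1.0, "fail": 0.0, "critical": -1.0,
}

def score_for(token: str) -> Optional[float]:
    # exact match first
    if token in TOKEN_TO_SCORE_MAP:
        return TOKEN_TO_SCORE_MAP[token]
    # strip whitespace/quotes and lowercase, e.g. ' "PASS"' -> 'pass'
    cleaned = token.strip().strip('"').lower()
    if cleaned in TOKEN_TO_SCORE_MAP:
        return TOKEN_TO_SCORE_MAP[cleaned]
    # integer-valued numerics, e.g. "4.0" -> "4"
    try:
        value = float(token)
        if value.is_integer():
            return TOKEN_TO_SCORE_MAP.get(str(int(value)))
    except ValueError:
        return None
    return None

assert score_for('"PASS"') == 1.0
assert score_for("4.0") == 4.0
assert score_for(",") is None
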
class GEvalTask(kiln_ai.datamodel.task.Task):

Kiln task for executing a G-Eval. Can be run on any Kiln adapter which supports logprobs.

Note: GEval implements both G-Eval and LLM as Judge, as they are very similar.
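
As a rough sketch of what the constructor builds (the eval steps below are made up for illustration; real ones come from eval_config.properties["eval_steps"]), the thinking instruction is just a numbered list derived from the eval config:

# Hypothetical eval steps, for illustration only
eval_steps = [
    "Check the output answers the question asked in the input.",
    "Check the output follows the task's formatting requirements.",
]

cot_instructions = (
    "First, think step by step about the model's performance "
    "following these evaluation steps:\n\n"
)
for i, step in enumerate(eval_steps):
    cot_instructions += f"{i + 1}) {step}\n"

print(cot_instructions)
# 1) Check the output answers the question asked in the input.
# 2) Check the output follows the task's formatting requirements.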

GEvalTask(eval_config: kiln_ai.datamodel.eval.EvalConfig)



class GEval(kiln_ai.adapters.eval.base_eval.BaseEval):

An evaluator which implements both G-Eval and LLM as Judge.

G-Eval is a method of evaluating the quality of a model's output. The score is a weighted average of the candidate rating tokens, weighted by the probabilities derived from their log probabilities. https://arxiv.org/abs/2303.16634

LLM as Judge is a method of evaluating the quality of a model's output. It simply asks the LLM to score, and uses the returned output directly (no logprobs needed). Also called direct evaluation.

@misc{liu2023gevalnlgevaluationusing, title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment}, author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu}, year={2023}, eprint={2303.16634}, archivePrefix={arXiv}, primaryClass={cs.CL}, url={https://arxiv.org/abs/2303.16634}, }
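
To make the difference concrete, here is a minimal sketch (with made-up logprob values) of how the two modes would score the same "4" rating token: LLM as Judge takes the emitted value at face value, while G-Eval averages over the candidate rating tokens weighted by their probabilities.

import math

# Hypothetical top logprobs at the rating token position (illustrative values only)
top_logprobs = {"4": -0.22, "5": -1.61, ",": -5.0}

# LLM as Judge: use the emitted token directly
llm_as_judge_score = 4.0

# G-Eval: probability-weighted average over valid rating tokens ("," is ignored)
token_scores = {"4": 4.0, "5": 5.0}
total = sum(math.exp(lp) for tok, lp in top_logprobs.items() if tok in token_scores)
g_eval_score = sum(
    token_scores[tok] * math.exp(lp)
    for tok, lp in top_logprobs.items()
    if tok in token_scores
) / total

print(round(llm_as_judge_score, 2), round(g_eval_score, 2))  # 4.0 vs roughly 4.2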

GEval( eval_config: kiln_ai.datamodel.eval.EvalConfig, run_config: kiln_ai.datamodel.task.RunConfig | None)
geval_task
async def run_eval( self, task_run: kiln_ai.datamodel.TaskRun) -> tuple[typing.Dict[str, float], typing.Optional[typing.Dict[str, str]]]:

Run this eval on the given task run.
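
A hedged usage sketch: assuming you already have an EvalConfig (of type g_eval or llm_as_judge), an optional RunConfig, and a TaskRun loaded from your Kiln project (obtaining those is outside this module), running the eval looks roughly like this. The my_* names are placeholders.

import asyncio

from kiln_ai.adapters.eval.g_eval import GEval

async def score_one(eval_config, run_config, task_run):
    evaluator = GEval(eval_config, run_config)
    # Returns the per-metric scores plus any intermediate outputs (e.g. the chain of thought)
    scores, intermediate_outputs = await evaluator.run_eval(task_run)
    return scores

# scores = asyncio.run(score_one(my_eval_config, my_run_config, my_task_run))
# e.g. {"overall_rating": 4.21}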

def build_llm_as_judge_score( self, run_output: kiln_ai.adapters.run_output.RunOutput) -> Dict[str, float]:

Build the LLM as Judge score for the given run and run output.
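
For instance, if the judge model returned the structured output below (metric names invented for illustration), each discrete value maps through TOKEN_TO_SCORE_MAP to a float. This is a standalone sketch; the real method reads run_output.output.

judge_output = {"overall_rating": "4", "meets_requirements": "pass"}

TOKEN_TO_SCORE_MAP = {"1": 1.0, "2": 2.0, "3": 3.0, "4": 4.0, "5": 5.0,
                      "pass": 1.0, "fail": 0.0, "critical": -1.0}

scores = {metric: TOKEN_TO_SCORE_MAP[f"{value}".strip().strip('"').lower()]
          for metric, value in judge_output.items()}

print(scores)  # {'overall_rating': 4.0, 'meets_requirements': 1.0}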

def build_g_eval_score( self, run_output: kiln_ai.adapters.run_output.RunOutput) -> Dict[str, float]:

Build the G-Eval score for the given run and run output.

We create a weighted average of each rating using the logprobs.

@misc{liu2023gevalnlgevaluationusing, title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment}, author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu}, year={2023}, eprint={2303.16634}, archivePrefix={arXiv}, primaryClass={cs.CL}, url={https://arxiv.org/abs/2303.16634}, }

def g_eval_single_metric( self, run_output: kiln_ai.adapters.run_output.RunOutput, metric: str, metric_offsets: Dict[str, int], raw_output: str) -> float | None:

Run the G-Eval for a single metric.

Scan the logprobs for the metric and return the weighted score of the rating token.
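
A simplified sketch of the scan, using a made-up token stream: walk the response tokens while tracking a running character offset, and return the score of the first token inside the metric's search range that maps to a rating.

# Hypothetical tokenization of '{"overall_rating": 4}' - real tokens come from run_output.output_logprobs
tokens = ['{"', 'overall', '_rating', '":', ' ', '4', '}']
start_offset, end_offset = 15, 21  # search window for overall_rating (see token_search_range)

RATING_SCORES = {"4": 4.0}  # stand-in for rating_token_to_score

offset = 0
score = None
for token in tokens:
    if offset >= end_offset:
        break
    if offset >= start_offset:
        score = RATING_SCORES.get(token.strip())
        if score is not None:
            break
    offset += len(token)

print(score)  # 4.0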

def raw_output_from_logprobs(self, run_output: kiln_ai.adapters.run_output.RunOutput) -> str:

Build the raw output string from the logprobs. Generate from logprobs so it's guaranteed to match the logprobs offsets
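
Equivalently (a sketch using stand-in objects rather than litellm's ChatCompletionTokenLogprob), the raw string is just the concatenation of the response tokens, so character offsets computed against it line up with the token stream:

# Hypothetical logprob content - real entries are ChatCompletionTokenLogprob objects
class FakeTokenLogprob:
    def __init__(self, token: str):
        self.token = token

content = [FakeTokenLogprob(t) for t in ['{"', 'overall', '_rating', '":', ' ', '4', '}']]

raw_output = "".join(entry.token for entry in content)
print(raw_output)  # {"overall_rating": 4}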

def token_search_range( self, raw_output: str, metric: str, metric_offsets: Dict[str, int]) -> Tuple[int, int]:

Find the start and end offsets of the metric in the raw output.

Start searching after the end of the target metric json entry ("overall_rating":), and before the start of the next metric ("some_other_score").
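
A worked example (metric names invented for illustration): with two metrics in the raw JSON, the search range for the first metric ends where the next metric's quoted name begins.

raw_output = '{"overall_rating": 4, "meets_requirements": "pass"}'

# offsets of each quoted metric name (see metric_offsets below)
metric_offsets = {
    "overall_rating": raw_output.find('"overall_rating"'),          # 1
    "meets_requirements": raw_output.find('"meets_requirements"'),  # 22
}

metric = "overall_rating"
start_offset = metric_offsets[metric] + len(metric)  # 1 + 14 = 15

# end at the smallest metric offset past the start, else the end of the string
end_offset = min(
    [v for v in metric_offsets.values() if v > start_offset],
    default=len(raw_output),
)

print(start_offset, end_offset)  # 15 22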

def rating_token_to_score( self, token_logprob: litellm.types.utils.ChatCompletionTokenLogprob) -> float | None:

Convert a rating token to a score using weighted average of top logprobs.

Only includes tokens that have valid scores.

Some cleanup for upper case, whitespace and quotes. LLMs aren't always consistent.
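
A numeric sketch of the weighted average (logprob values invented): only alternatives that map to valid scores contribute, and the result is normalized by their total probability so non-rating tokens in the top logprobs don't drag the score down.

import math

# Hypothetical top logprobs at the rating token position
top_logprobs = [("4", -0.36), ("5", -1.90), ("3", -2.70), ('":', -6.0)]
token_scores = {"3": 3.0, "4": 4.0, "5": 5.0}  # '":' has no score and is skipped

total_score = 0.0
total_probability = 0.0
for token, logprob in top_logprobs:
    score = token_scores.get(token)
    if score is not None:
        probability = math.exp(logprob)
        total_score += score * probability
        total_probability += probability

weighted_score = total_score / total_probability
print(round(weighted_score, 2))  # about 4.09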

def score_from_token_string(self, token: str) -> float | None:
def metric_offsets(self, raw_output: str, metrics: List[str]) -> Dict[str, int]:

Find the offset of the start of each metric in the raw output json.

For the example json `{"overall_rating": 1}` it should return {"overall_rating": 1}, since the quoted metric name starts 1 character into the json string.
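
A worked example (metric names invented): each quoted metric name must appear exactly once, and its offset is where that quoted name starts in the raw JSON.

raw_output = '{"overall_rating": 4, "meets_requirements": "pass"}'
metrics = ["overall_rating", "meets_requirements"]

metric_offsets = {}
for metric in metrics:
    metric_name = f'"{metric}"'
    assert raw_output.count(metric_name) == 1  # the method raises ValueError otherwise
    metric_offsets[metric] = raw_output.find(metric_name)

print(metric_offsets)  # {'overall_rating': 1, 'meets_requirements': 22}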