kiln_ai.adapters.eval.g_eval
"""G-Eval and LLM as Judge evaluators.

G-Eval (https://arxiv.org/abs/2303.16634) scores model output as a
logprob-weighted average over discrete rating tokens; LLM as Judge parses the
judge model's structured rating directly (no logprobs needed).
"""

import math
from typing import Dict, List, Tuple

from litellm.types.utils import ChatCompletionTokenLogprob

from kiln_ai.adapters.adapter_registry import adapter_for_task
from kiln_ai.adapters.eval.base_eval import BaseEval
from kiln_ai.adapters.eval.eval_utils.eval_trace_formatter import EvalTraceFormatter
from kiln_ai.adapters.eval.eval_utils.eval_utils import EvalUtils
from kiln_ai.adapters.ml_model_list import (
    default_structured_output_mode_for_model_provider,
)
from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig, RunOutput
from kiln_ai.adapters.prompt_builders import PromptGenerators
from kiln_ai.datamodel import Project, Task, TaskRun
from kiln_ai.datamodel.eval import EvalConfig, EvalConfigType, EvalDataType, EvalScores
from kiln_ai.datamodel.run_config import KilnAgentRunConfigProperties
from kiln_ai.datamodel.task import RunConfigProperties, StructuredOutputMode

# all the tokens we score for, and their float scores.
TOKEN_TO_SCORE_MAP: Dict[str, float] = {
    "1": 1.0,
    "2": 2.0,
    "3": 3.0,
    "4": 4.0,
    "5": 5.0,
    "pass": 1.0,
    "fail": 0.0,
    "critical": -1.0,
}


class GEvalTask(Task, parent_of={}):
    """
    Kiln task for executing a G-Eval. Can be run on any Kiln adapter which supports logprobs.

    Note G-Eval implements both G-Eval and LLM as Judge as they are very similar.
    """

    def __init__(self, eval_config: EvalConfig):
        # Throwaway parent project: this task is never saved (allow_saving=False downstream).
        tmp_project = Project(name="GEval")

        # Build a simple LLM as Judge system instruction
        system_instruction = "Your job to evaluate a model's performance on a task. Blocks will be marked with <eval_data> tags.\n"
        # Optionally add a short task description
        task_description = eval_config.properties.get("task_description", None)
        if task_description:
            system_instruction += f"\nThe task the model was given is as follows:\n<eval_data>\n<task_description>{task_description}</task_description>\n</eval_data>\n"

        # Build the COT eval instructions
        steps = eval_config.properties.get("eval_steps", [])
        if not isinstance(steps, list):
            raise ValueError("eval_steps must be a list.")
        if len(steps) == 1:
            cot_instructions = "First, think step by step about the model's performance following this evaluation step:\n\n"
            cot_instructions += f"{steps[0]}\n"
        else:
            cot_instructions = "First, think step by step about the model's performance following these evaluation steps:\n\n"
            for i, step in enumerate(steps):
                cot_instructions += f"{i + 1}) {step}\n"

        eval = eval_config.parent_eval()
        if not eval:
            raise ValueError("Eval config must have a parent eval")

        # Build the output schema from the eval's target output scores.
        # We restrict the LLM's output scoring schema to discrete scores (pass/fail/critical/1-5) - allow_float_scores=False
        # However, the final scores from the evaluator can be a float (see later logprob calculation, which requires discrete token outputs)
        output_schema = BaseEval.build_score_schema(eval, allow_float_scores=False)

        super().__init__(
            name="GEval Task",
            parent=tmp_project,
            instruction=system_instruction,
            thinking_instruction=cot_instructions,
            output_json_schema=output_schema,
        )


class GEval(BaseEval):
    """
    An evaluator which implements G-Eval and LLM as Judge.

    G-Eval is a method of evaluating the quality of a model's output. It is a weighted average of the scores of the tokens in the output. The weights are the log probabilities of the tokens in the output. https://arxiv.org/abs/2303.16634

    LLM as Judge is a method of evaluating the quality of a model's output. It simply asks the LLM to score, and uses the returned output (no logprobs needed). Also called direct evaluation.

    @misc{liu2023gevalnlgevaluationusing,
        title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
        author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
        year={2023},
        eprint={2303.16634},
        archivePrefix={arXiv},
        primaryClass={cs.CL},
        url={https://arxiv.org/abs/2303.16634},
    }
    """

    def __init__(self, eval_config: EvalConfig, run_config: RunConfigProperties | None):
        # Only the two judge-style config types are supported by this evaluator.
        if eval_config.config_type not in (
            EvalConfigType.g_eval,
            EvalConfigType.llm_as_judge,
        ):
            raise ValueError(
                f"GEval must be initialized with a GEval or LLM as Judge config_type. Got {eval_config.config_type}"
            )

        super().__init__(eval_config, run_config)

        self.geval_task = GEvalTask(eval_config)

    def generate_final_answer_run_description(
        self, eval_input: str, eval_output: str
    ) -> str:
        """Build the judge prompt body for final-answer evaluation (input + output only)."""
        return f"""The model was given the following input for the task:
<eval_data>
{eval_input}
</eval_data>

The model produced the following output for the task:
<eval_data>
{eval_output}
</eval_data>
"""

    def generate_ref_ans_run_description(
        self, eval_input: str, eval_output: str, reference_answer: str
    ) -> str:
        """Build the judge prompt body for reference-answer evaluation (input + output + golden answer)."""
        return f"""The model was given the following input for the task:
<eval_data>
{eval_input}
</eval_data>

The model produced the following output for the task:
<eval_data>
{eval_output}
</eval_data>

This is the reference answer:
<eval_data>
{reference_answer}
</eval_data>
"""

    def generate_full_trace_run_description(
        self,
        eval_input: str,
        available_tools: str | None,
        conversation_history: str,
    ) -> str:
        """Build the judge prompt body for full-trace evaluation.

        Includes the user input, any tool-use guidance (from the eval's spec, or
        legacy template_properties as a fallback), the available tool list, and
        the full conversation history.
        """
        description = ""
        description += f"""The model was given the following <user_input> for the <task_description>:
<eval_data>
<user_input>{eval_input}</user_input>
</eval_data>
"""
        # Get properties from spec if available, otherwise from eval.template_properties (for legacy evals)
        spec = self.eval.associated_spec(readonly=True)

        # Spec uses different keys than legacy eval template_properties
        if spec:
            # Spec: tool_use_guidelines, appropriate_tool_use_examples, inappropriate_tool_use_examples
            tool_use_guidelines = str(spec.properties.get("tool_use_guidelines") or "")
            appropriate_tool_use_examples = str(
                spec.properties.get("appropriate_tool_use_examples") or ""
            )
            inappropriate_tool_use_examples = str(
                spec.properties.get("inappropriate_tool_use_examples") or ""
            )
            description += f"""The model was given the following <tool_use_guidelines>:
<eval_data>
<tool_use_guidelines>
{tool_use_guidelines}
</tool_use_guidelines>
</eval_data>
"""
            description += f"""The model was given the following <appropriate_tool_use_examples>:
<eval_data>
<appropriate_tool_use_examples>
{appropriate_tool_use_examples}
</appropriate_tool_use_examples>
</eval_data>
"""
            description += f"""The model was given the following <inappropriate_tool_use_examples>:
<eval_data>
<inappropriate_tool_use_examples>
{inappropriate_tool_use_examples}
</inappropriate_tool_use_examples>
</eval_data>
"""
        elif self.eval.template_properties:
            # Legacy eval: appropriate_tool_use_guidelines, inappropriate_tool_use_guidelines
            appropriate_tool_use_guidelines = str(
                self.eval.template_properties.get("appropriate_tool_use_guidelines")
                or ""
            )
            inappropriate_tool_use_guidelines = str(
                self.eval.template_properties.get("inappropriate_tool_use_guidelines")
                or ""
            )

            description += f"""The model was given the following <appropriate_tool_use_guidelines> guidelines:
<eval_data>
<appropriate_tool_use_guidelines>
{appropriate_tool_use_guidelines}
</appropriate_tool_use_guidelines>
</eval_data>
"""
            # Only include if it has content since it is optional
            if inappropriate_tool_use_guidelines:
                description += f"""The model was given the following <inappropriate_tool_use_guidelines> guidelines:
<eval_data>
<inappropriate_tool_use_guidelines>
{inappropriate_tool_use_guidelines}
</inappropriate_tool_use_guidelines>
</eval_data>
"""

        if available_tools is not None:
            if available_tools != "":
                description += f"""
This is the list of tools available to the model:
<eval_data>
<available_tools>{available_tools}</available_tools>
</eval_data>
"""
            else:
                description += """
There were no tools available to the model.
"""

        description += f"""
This is the full conversation history for the task run:
<eval_data>
<conversation_history>{conversation_history}</conversation_history>
</eval_data>
"""
        return description

    async def run_eval(
        self, task_run: TaskRun, eval_job_item: TaskRun | None = None
    ) -> tuple[EvalScores, Dict[str, str] | None]:
        """
        Run this eval on the given task run.

        Returns the computed scores and the judge's intermediate outputs (COT).
        Raises ValueError when required data (trace / eval job item) is missing.
        """

        model_name, provider = self.model_and_provider()

        # Only fetch logprobs for G-Eval
        # There are at most 5 valid rating tokens per rating type (five_star being largest), so 10 is more than enough to get to the very very unlikely
        top_logprobs = (
            10 if self.eval_config.config_type == EvalConfigType.g_eval else None
        )

        # We don't expose setting this manually in the UI, so pull a recommended mode from ml_model_list
        structured_output_mode = default_structured_output_mode_for_model_provider(
            model_name,
            provider,
            default=StructuredOutputMode.json_schema,
            # G-eval expects JSON, so don't allow function calling modes
            disallowed_modes=[
                StructuredOutputMode.function_calling,
                StructuredOutputMode.function_calling_weak,
            ],
        )

        adapter = adapter_for_task(
            self.geval_task,
            run_config_properties=KilnAgentRunConfigProperties(
                model_name=model_name,
                model_provider_name=provider,
                # We always use Simple COT for G-Eval and LLM as Judge
                prompt_id=PromptGenerators.SIMPLE_CHAIN_OF_THOUGHT,
                structured_output_mode=structured_output_mode,
            ),
            base_adapter_config=AdapterConfig(
                # Don't save this run into the task_runs. It will be saved into an eval_run where it belongs
                allow_saving=False,
                top_logprobs=top_logprobs,
            ),
        )

        if self.eval.evaluation_data_type == EvalDataType.full_trace:
            if task_run.trace is None:
                raise ValueError("Task run trace is required for full trace evaluation")

            available_tools = await EvalUtils.formatted_available_tools_from_task_run(
                task_run
            )
            run_description = self.generate_full_trace_run_description(
                task_run.input,
                available_tools,
                EvalTraceFormatter.trace_to_formatted_conversation_history(
                    task_run.trace
                ),
            )

        elif self.eval.evaluation_data_type == EvalDataType.reference_answer:
            if eval_job_item is None:
                raise ValueError(
                    "Eval job item is required for reference answer evaluation"
                )
            run_description = self.generate_ref_ans_run_description(
                task_run.input, task_run.output.output, eval_job_item.output.output
            )

        else:  # EvalDataType.final_answer
            run_description = self.generate_final_answer_run_description(
                task_run.input, task_run.output.output
            )

        # We don't need the run, but invoke_returning_run_output() runs validations for us over _run()
        _, run_output = await adapter.invoke_returning_run_output(run_description)

        if self.eval_config.config_type == EvalConfigType.llm_as_judge:
            return self.build_llm_as_judge_score(
                run_output
            ), run_output.intermediate_outputs
        else:
            return self.build_g_eval_score(run_output), run_output.intermediate_outputs

    def build_llm_as_judge_score(self, run_output: RunOutput) -> EvalScores:
        """
        Build the LLM as Judge score for the given run and run output.

        Raises ValueError if the output is not a dict or a metric's value is not
        a recognized rating token.
        """
        # Convert the output format we asked for (discreet values) to our float scores
        scores: EvalScores = {}
        if not isinstance(run_output.output, dict):
            raise ValueError("LLM as Judge output must be a dictionary")

        for metric, score in run_output.output.items():
            token_score = self.score_from_token_string(f"{score}")
            if token_score is None:
                raise ValueError(
                    f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema."
                )
            scores[metric] = token_score
        return scores

    def build_g_eval_score(self, run_output: RunOutput) -> EvalScores:
        """
        Build the G-Eval score for the given run and run output.

        We create a weighted average of each rating using the logprobs.

        @misc{liu2023gevalnlgevaluationusing,
            title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
            author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
            year={2023},
            eprint={2303.16634},
            archivePrefix={arXiv},
            primaryClass={cs.CL},
            url={https://arxiv.org/abs/2303.16634},
        }
        """
        # We use structured output
        outputs = run_output.output
        assert isinstance(outputs, dict)

        # Build raw string output from the logprobs, which is easier to work with than Dict for the next bit
        raw_output = self.raw_output_from_logprobs(run_output)

        # find the offset the start of each metric in the raw output json
        metrics: List[str] = list(outputs.keys())
        metric_offsets = self.metric_offsets(raw_output, metrics)

        final_scores: EvalScores = {}
        for metric in metrics:
            score = self.g_eval_single_metric(
                run_output, metric, metric_offsets, raw_output
            )
            if score is None:
                raise ValueError(
                    f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema."
                )
            final_scores[metric] = score

        return final_scores

    def g_eval_single_metric(
        self,
        run_output: RunOutput,
        metric: str,
        metric_offsets: Dict[str, int],
        raw_output: str,
    ) -> float | None:
        """
        Run the G-Eval for a single metric.

        Scan the logprobs for the metric and return the weighted score of the rating token.
        Returns None if no rating token is found in the metric's range.
        """

        start_offset, end_offset = self.token_search_range(
            raw_output, metric, metric_offsets
        )

        if (
            run_output.output_logprobs is None
            or run_output.output_logprobs.content is None
        ):
            raise RuntimeError(
                "No logprobs found for output - can not calculate g-eval"
            )

        # scan the tokens in the range, looking for the rating token
        offset = 0
        for chat_logprob in run_output.output_logprobs.content:
            if offset >= end_offset:
                break
            if offset >= start_offset:
                score = self.rating_token_to_score(chat_logprob)
                if score is not None:
                    return score
            offset += len(chat_logprob.token)

        return None

    def raw_output_from_logprobs(self, run_output: RunOutput) -> str:
        """
        Build the raw output string from the logprobs. Generate from logprobs so it's guaranteed to match the logprobs offsets
        """
        if (
            run_output.output_logprobs is None
            or run_output.output_logprobs.content is None
        ):
            raise RuntimeError(
                "No logprobs found for output - can not calculate g-eval"
            )

        # join() is linear; repeated += would be quadratic on long outputs
        return "".join(
            chat_logprob.token for chat_logprob in run_output.output_logprobs.content
        )

    def token_search_range(
        self, raw_output: str, metric: str, metric_offsets: Dict[str, int]
    ) -> Tuple[int, int]:
        """
        Find the start and end offsets of the metric in the raw output.

        Start searching after the end of the target metric json entry ("overall_rating":), and before the start of the next metric ("some_other_score").
        """
        start_offset = metric_offsets[metric] + len(metric)

        # End at the nearest following metric, or the end of the output if this is the last metric
        end_offset = min(
            (v for v in metric_offsets.values() if v > start_offset),
            default=len(raw_output),
        )

        return start_offset, end_offset

    def rating_token_to_score(
        self, token_logprob: ChatCompletionTokenLogprob
    ) -> float | None:
        """
        Convert a rating token to a score using weighted average of top logprobs.

        Only includes tokens that have valid scores.

        Some cleanup for upper case, whitespace and quotes. LLMs aren't always consistent.
        """
        primary_token_score = self.score_from_token_string(token_logprob.token)
        # check this is a real rating token, it could just be the ": ", "," or whitespace
        if primary_token_score is None:
            return None

        total_score = 0.0
        total_probability = 0.0
        top_logprobs_contains_primary_token = False

        # Process all valid scoring tokens from alternatives
        for top_logprob in token_logprob.top_logprobs:
            if top_logprob.token == token_logprob.token:
                top_logprobs_contains_primary_token = True
            token_score = self.score_from_token_string(top_logprob.token)
            if token_score is not None:
                # Convert logprob to probability
                probability = math.exp(top_logprob.logprob)
                total_score += token_score * probability
                total_probability += probability

        # Weird OpenAI 4o bug - sometimes the primary token is included in the top logprobs, sometimes not.
        # Add the primary token back in if excluded
        if not top_logprobs_contains_primary_token:
            if token_logprob.logprob == -9999.0:
                # Another "bug" - sometimes the logprob is -9999.0. This seems to happen when the rest of the logprobs are tiny probability.
                total_score += primary_token_score * 1.0
                total_probability += 1.0
            else:
                probability = math.exp(token_logprob.logprob)
                total_score += primary_token_score * probability
                total_probability += probability

        if total_probability <= 0.0:
            raise RuntimeError(
                f"No valid scoring tokens found for {token_logprob.token}. This should never happen as the token has a valid score (so it must be excluded from top logprobs). Please file a bug if you see this."
            )

        # Normalize by total probability of valid tokens (LLM may have wanted to generate other non-rating tokens, these shouldn't lower score of rating tokens)
        weighted_score = total_score / total_probability

        return weighted_score

    def score_from_token_string(self, token: str) -> float | None:
        """Map a raw token to its float score, tolerating quotes/case/whitespace and numeric forms like "1.0". Returns None for non-rating tokens."""
        if token in TOKEN_TO_SCORE_MAP:
            return TOKEN_TO_SCORE_MAP[token]

        # handle more token variations like '"1"' and '"pass"' and ' paSS' and 'PASS'
        unquoted_token = token.strip().strip('"').lower()
        if unquoted_token in TOKEN_TO_SCORE_MAP:
            return TOKEN_TO_SCORE_MAP[unquoted_token]

        # handle numeric tokens like "1.0"
        try:
            float_value = float(token)
            if float_value.is_integer():
                str_token = str(int(float_value))
                if str_token in TOKEN_TO_SCORE_MAP:
                    return TOKEN_TO_SCORE_MAP[str_token]
        except ValueError:
            pass

        return None

    def metric_offsets(self, raw_output: str, metrics: List[str]) -> Dict[str, int]:
        """
        Find the offset to the start of each metric in the raw output json

        For the example json: `{"overall_rating": 1}` == 1

        should return:
        {
            "overall_rating": 1 # it's 1 character into the json string
        }

        Raises ValueError if a metric does not appear exactly once.
        """
        metric_offsets: Dict[str, int] = {}
        for metric in metrics:
            # the quoted metric name is expected in the json: `{"overall_rating": 1}` == 1
            metric_name = f'"{metric}"'

            # we expect it exactly once
            count = raw_output.count(metric_name)
            if count != 1:
                raise ValueError(
                    f"Metric {metric} should appear exactly once in the output. Found {count} times"
                )

            # count == 1 guarantees find() succeeds, so no -1 check is needed
            metric_offsets[metric] = raw_output.find(metric_name)
        return metric_offsets
class GEvalTask(Task, parent_of={}):
    """
    Kiln task for executing a G-Eval. Can be run on any Kiln adapter which supports logprobs.

    Note G-Eval implements both G-Eval and LLM as Judge as they are very similar.
    """

    def __init__(self, eval_config: EvalConfig):
        # Throwaway parent; this task is never persisted.
        project_stub = Project(name="GEval")

        # Judge system instruction, optionally extended with a task description.
        instruction_text = "Your job to evaluate a model's performance on a task. Blocks will be marked with <eval_data> tags.\n"
        description = eval_config.properties.get("task_description", None)
        if description:
            instruction_text += f"\nThe task the model was given is as follows:\n<eval_data>\n<task_description>{description}</task_description>\n</eval_data>\n"

        # Chain-of-thought instructions built from the configured eval steps.
        eval_steps = eval_config.properties.get("eval_steps", [])
        if not isinstance(eval_steps, list):
            raise ValueError("eval_steps must be a list.")
        if len(eval_steps) == 1:
            thinking_text = (
                "First, think step by step about the model's performance following this evaluation step:\n\n"
                + f"{eval_steps[0]}\n"
            )
        else:
            thinking_text = (
                "First, think step by step about the model's performance following these evaluation steps:\n\n"
                + "".join(
                    f"{index + 1}) {step}\n"
                    for index, step in enumerate(eval_steps)
                )
            )

        parent = eval_config.parent_eval()
        if not parent:
            raise ValueError("Eval config must have a parent eval")

        # Discrete-only score schema (pass/fail/critical/1-5); the evaluator may
        # still emit float scores later via the logprob-weighted calculation.
        schema = BaseEval.build_score_schema(parent, allow_float_scores=False)

        super().__init__(
            name="GEval Task",
            parent=project_stub,
            instruction=instruction_text,
            thinking_instruction=thinking_text,
            output_json_schema=schema,
        )
Kiln task for executing a G-Eval. Can be run on any Kiln adapter which supports logprobs.
Note G-Eval implements both G-Eval and LLM as Judge as they are very similar.
41 def __init__(self, eval_config: EvalConfig): 42 tmp_project = Project(name="GEval") 43 44 # Build a simple LLM as Judge system instruction 45 system_instruction = "Your job to evaluate a model's performance on a task. Blocks will be marked with <eval_data> tags.\n" 46 # Optionally add a short task description 47 task_description = eval_config.properties.get("task_description", None) 48 if task_description: 49 system_instruction += f"\nThe task the model was given is as follows:\n<eval_data>\n<task_description>{task_description}</task_description>\n</eval_data>\n" 50 51 # Build the COT eval instructions 52 steps = eval_config.properties.get("eval_steps", []) 53 if not isinstance(steps, list): 54 raise ValueError("eval_steps must be a list.") 55 if len(steps) == 1: 56 cot_instructions = "First, think step by step about the model's performance following this evaluation step:\n\n" 57 cot_instructions += f"{steps[0]}\n" 58 else: 59 cot_instructions = "First, think step by step about the model's performance following these evaluation steps:\n\n" 60 for i, step in enumerate(steps): 61 cot_instructions += f"{i + 1}) {step}\n" 62 63 eval = eval_config.parent_eval() 64 if not eval: 65 raise ValueError("Eval config must have a parent eval") 66 67 # Build the output schema from the eval's target output scores. 68 # We restrict the LLM's output scoring schema to discrete scores (pass/fail/critical/1-5) - allow_float_scores=False 69 # However, the final scores from the evaluator can be a float (see later logprob calculation, which requires discrete token outputs) 70 output_schema = BaseEval.build_score_schema(eval, allow_float_scores=False) 71 72 super().__init__( 73 name="GEval Task", 74 parent=tmp_project, 75 instruction=system_instruction, 76 thinking_instruction=cot_instructions, 77 output_json_schema=output_schema, 78 )
Create a new model by parsing and validating input data from keyword arguments.
Raises [ValidationError][pydantic_core.ValidationError] if the input data cannot be
validated to form a valid model.
self is explicitly positional-only to allow self as a field name.
Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
337def init_private_attributes(self: BaseModel, context: Any, /) -> None: 338 """This function is meant to behave like a BaseModel method to initialise private attributes. 339 340 It takes context as an argument since that's what pydantic-core passes when calling it. 341 342 Args: 343 self: The BaseModel instance. 344 context: The context. 345 """ 346 if getattr(self, '__pydantic_private__', None) is None: 347 pydantic_private = {} 348 for name, private_attr in self.__private_attributes__.items(): 349 default = private_attr.get_default() 350 if default is not PydanticUndefined: 351 pydantic_private[name] = default 352 object_setattr(self, '__pydantic_private__', pydantic_private)
This function is meant to behave like a BaseModel method to initialise private attributes.
It takes context as an argument since that's what pydantic-core passes when calling it.
Args: self: The BaseModel instance. context: The context.
81class GEval(BaseEval): 82 """ 83 A evaluator which implements G-Eval and LLM as Judge. 84 85 G-Eval is a method of evaluating the quality of a model's output. It is a weighted average of the scores of the tokens in the output. The weights are the log probabilities of the tokens in the output. https://arxiv.org/abs/2303.16634 86 87 LLM as Judge is a method of evaluating the quality of a model's output. It simply asks the LLM to score, and uses the returned output (no logprobs needed). Also called direct evaluation. 88 89 @misc{liu2023gevalnlgevaluationusing, 90 title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment}, 91 author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu}, 92 year={2023}, 93 eprint={2303.16634}, 94 archivePrefix={arXiv}, 95 primaryClass={cs.CL}, 96 url={https://arxiv.org/abs/2303.16634}, 97 } 98 """ 99 100 def __init__(self, eval_config: EvalConfig, run_config: RunConfigProperties | None): 101 if ( 102 eval_config.config_type != EvalConfigType.g_eval 103 and eval_config.config_type != EvalConfigType.llm_as_judge 104 ): 105 raise ValueError( 106 f"GEval must be initialized with a GEval or LLM as Judge config_type. 
Got {eval_config.config_type}" 107 ) 108 109 super().__init__(eval_config, run_config) 110 111 self.geval_task = GEvalTask(eval_config) 112 113 def generate_final_answer_run_description( 114 self, eval_input: str, eval_output: str 115 ) -> str: 116 return f"""The model was given the following input for the task: 117<eval_data> 118{eval_input} 119</eval_data> 120 121The model produced the following output for the task: 122<eval_data> 123{eval_output} 124</eval_data> 125""" 126 127 def generate_ref_ans_run_description( 128 self, eval_input: str, eval_output: str, reference_answer: str 129 ) -> str: 130 return f"""The model was given the following input for the task: 131<eval_data> 132{eval_input} 133</eval_data> 134 135The model produced the following output for the task: 136<eval_data> 137{eval_output} 138</eval_data> 139 140This is the reference answer: 141<eval_data> 142{reference_answer} 143</eval_data> 144""" 145 146 def generate_full_trace_run_description( 147 self, 148 eval_input: str, 149 available_tools: str | None, 150 conversation_history: str, 151 ) -> str: 152 description = "" 153 description += f"""The model was given the following <user_input> for the <task_description>: 154<eval_data> 155<user_input>{eval_input}</user_input> 156</eval_data> 157""" 158 # Get properties from spec if available, otherwise from eval.template_properties (for legacy evals) 159 spec = self.eval.associated_spec(readonly=True) 160 161 # Spec uses different keys than legacy eval template_properties 162 if spec: 163 # Spec: tool_use_guidelines, appropriate_tool_use_examples, inappropriate_tool_use_examples 164 tool_use_guidelines = str(spec.properties.get("tool_use_guidelines") or "") 165 appropriate_tool_use_examples = str( 166 spec.properties.get("appropriate_tool_use_examples") or "" 167 ) 168 inappropriate_tool_use_examples = str( 169 spec.properties.get("inappropriate_tool_use_examples") or "" 170 ) 171 description += f"""The model was given the following 
<tool_use_guidelines>: 172<eval_data> 173<tool_use_guidelines> 174{tool_use_guidelines} 175</tool_use_guidelines> 176</eval_data> 177""" 178 description += f"""The model was given the following <appropriate_tool_use_examples>: 179<eval_data> 180<appropriate_tool_use_examples> 181{appropriate_tool_use_examples} 182</appropriate_tool_use_examples> 183</eval_data> 184""" 185 description += f"""The model was given the following <inappropriate_tool_use_examples>: 186<eval_data> 187<inappropriate_tool_use_examples> 188{inappropriate_tool_use_examples} 189</inappropriate_tool_use_examples> 190</eval_data> 191""" 192 elif self.eval.template_properties: 193 # Legacy eval: appropriate_tool_use_guidelines, inappropriate_tool_use_guidelines 194 appropriate_tool_use_guidelines = str( 195 self.eval.template_properties.get("appropriate_tool_use_guidelines") 196 or "" 197 ) 198 inappropriate_tool_use_guidelines = str( 199 self.eval.template_properties.get("inappropriate_tool_use_guidelines") 200 or "" 201 ) 202 203 description += f"""The model was given the following <appropriate_tool_use_guidelines> guidelines: 204<eval_data> 205<appropriate_tool_use_guidelines> 206{appropriate_tool_use_guidelines} 207</appropriate_tool_use_guidelines> 208</eval_data> 209""" 210 # Only include if it has content since it is optional 211 if inappropriate_tool_use_guidelines: 212 description += f"""The model was given the following <inappropriate_tool_use_guidelines> guidelines: 213<eval_data> 214<inappropriate_tool_use_guidelines> 215{inappropriate_tool_use_guidelines} 216</inappropriate_tool_use_guidelines> 217</eval_data> 218""" 219 220 if available_tools is not None: 221 if available_tools != "": 222 description += f""" 223This is the list of tools available to the model: 224<eval_data> 225<available_tools>{available_tools}</available_tools> 226</eval_data> 227""" 228 else: 229 description += """ 230There were no tools available to the model. 
231""" 232 233 description += f""" 234This is the full conversation history for the task run: 235<eval_data> 236<conversation_history>{conversation_history}</conversation_history> 237</eval_data> 238""" 239 return description 240 241 async def run_eval( 242 self, task_run: TaskRun, eval_job_item: TaskRun | None = None 243 ) -> tuple[EvalScores, Dict[str, str] | None]: 244 """ 245 Run this eval on the given task run. 246 """ 247 248 model_name, provider = self.model_and_provider() 249 250 # Only fetch logprobs for G-Eval 251 # There are at most 5 valid rating tokens per rating type (five_star being largest), so 10 is more than enough to get to the very very unlikely 252 top_logprobs = ( 253 10 if self.eval_config.config_type == EvalConfigType.g_eval else None 254 ) 255 256 # We don't expose setting this manually in the UI, so pull a recommended mode from ml_model_list 257 structured_output_mode = default_structured_output_mode_for_model_provider( 258 model_name, 259 provider, 260 default=StructuredOutputMode.json_schema, 261 # G-eval expects JSON, so don't allow function calling modes 262 disallowed_modes=[ 263 StructuredOutputMode.function_calling, 264 StructuredOutputMode.function_calling_weak, 265 ], 266 ) 267 268 adapter = adapter_for_task( 269 self.geval_task, 270 run_config_properties=KilnAgentRunConfigProperties( 271 model_name=model_name, 272 model_provider_name=provider, 273 # We always use Simple COT for G-Eval and LLM as Judge 274 prompt_id=PromptGenerators.SIMPLE_CHAIN_OF_THOUGHT, 275 structured_output_mode=structured_output_mode, 276 ), 277 base_adapter_config=AdapterConfig( 278 # Don't save this run into the task_runs. 
It will be saved into an eval_run where it belongs 279 allow_saving=False, 280 top_logprobs=top_logprobs, 281 ), 282 ) 283 284 if self.eval.evaluation_data_type == EvalDataType.full_trace: 285 if task_run.trace is None: 286 raise ValueError("Task run trace is required for full trace evaluation") 287 288 available_tools = await EvalUtils.formatted_available_tools_from_task_run( 289 task_run 290 ) 291 run_description = self.generate_full_trace_run_description( 292 task_run.input, 293 available_tools, 294 EvalTraceFormatter.trace_to_formatted_conversation_history( 295 task_run.trace 296 ), 297 ) 298 299 elif self.eval.evaluation_data_type == EvalDataType.reference_answer: 300 if eval_job_item is None: 301 raise ValueError( 302 "Eval job item is required for reference answer evaluation" 303 ) 304 run_description = self.generate_ref_ans_run_description( 305 task_run.input, task_run.output.output, eval_job_item.output.output 306 ) 307 308 else: # EvalDataType.final_answer 309 run_description = self.generate_final_answer_run_description( 310 task_run.input, task_run.output.output 311 ) 312 313 # We don't need the run, but invoke_returning_run_output() runs validations for us over _run() 314 _, run_output = await adapter.invoke_returning_run_output(run_description) 315 316 if self.eval_config.config_type == EvalConfigType.llm_as_judge: 317 return self.build_llm_as_judge_score( 318 run_output 319 ), run_output.intermediate_outputs 320 else: 321 return self.build_g_eval_score(run_output), run_output.intermediate_outputs 322 323 def build_llm_as_judge_score(self, run_output: RunOutput) -> EvalScores: 324 """ 325 Build the LLM as Judge score for the given run and run output. 
326 """ 327 # Convert the output format we asked for (discreet values) to our float scores 328 scores: EvalScores = {} 329 if not isinstance(run_output.output, dict): 330 raise ValueError("LLM as Judge output must be a dictionary") 331 332 for metric, score in run_output.output.items(): 333 token_score = self.score_from_token_string(f"{score}") 334 if token_score is None: 335 raise ValueError( 336 f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema." 337 ) 338 scores[metric] = token_score 339 return scores 340 341 def build_g_eval_score(self, run_output: RunOutput) -> EvalScores: 342 """ 343 Build the G-Eval score for the given run and run output. 344 345 We create a weighted average of each rating using the logprobs. 346 347 @misc{liu2023gevalnlgevaluationusing, 348 title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment}, 349 author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu}, 350 year={2023}, 351 eprint={2303.16634}, 352 archivePrefix={arXiv}, 353 primaryClass={cs.CL}, 354 url={https://arxiv.org/abs/2303.16634}, 355 } 356 """ 357 # We use structured output 358 outputs = run_output.output 359 assert isinstance(outputs, dict) 360 361 # Build raw string output from the logprobs, which is easier to work with than Dict for the next bit 362 raw_output = self.raw_output_from_logprobs(run_output) 363 364 # find the offset the start of each metric in the raw output json 365 metrics: List[str] = list(outputs.keys()) 366 metric_offsets = self.metric_offsets(raw_output, metrics) 367 368 final_scores: EvalScores = {} 369 for metric in metrics: 370 score = self.g_eval_single_metric( 371 run_output, metric, metric_offsets, raw_output 372 ) 373 if score is None: 374 raise ValueError( 375 f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema." 
376 ) 377 final_scores[metric] = score 378 379 return final_scores 380 381 def g_eval_single_metric( 382 self, 383 run_output: RunOutput, 384 metric: str, 385 metric_offsets: Dict[str, int], 386 raw_output: str, 387 ) -> float | None: 388 """ 389 Run the G-Eval for a single metric. 390 391 Scan the logprobs for the metric and return the weighted score of the rating token. 392 """ 393 394 start_offset, end_offset = self.token_search_range( 395 raw_output, metric, metric_offsets 396 ) 397 398 offset = 0 399 400 if ( 401 run_output.output_logprobs is None 402 or run_output.output_logprobs.content is None 403 ): 404 raise RuntimeError( 405 "No logprobs found for output - can not calculate g-eval" 406 ) 407 408 # scan the tokens in the range, looking for the rating token 409 for _, chat_logprob in enumerate(run_output.output_logprobs.content): 410 if offset >= end_offset: 411 break 412 if offset >= start_offset: 413 score = self.rating_token_to_score(chat_logprob) 414 if score is not None: 415 return score 416 offset += len(chat_logprob.token) 417 418 return None 419 420 def raw_output_from_logprobs(self, run_output: RunOutput) -> str: 421 """ 422 Build the raw output string from the logprobs. Generate from logprobs so it's guaranteed to match the logprobs offsets 423 """ 424 if ( 425 run_output.output_logprobs is None 426 or run_output.output_logprobs.content is None 427 ): 428 raise RuntimeError( 429 "No logprobs found for output - can not calculate g-eval" 430 ) 431 432 raw = "" 433 for chat_logprob in run_output.output_logprobs.content: 434 raw += chat_logprob.token 435 return raw 436 437 def token_search_range( 438 self, raw_output: str, metric: str, metric_offsets: Dict[str, int] 439 ) -> Tuple[int, int]: 440 """ 441 Find the start and end offsets of the metric in the raw output. 442 443 Start searching after the end of the target metric json entry ("overall_rating":), and before the start of the next metric ("some_other_score"). 
444 """ 445 start_offset = metric_offsets[metric] + len(metric) 446 447 # Find the lowest end offset that is greater than the start offset 448 end_offset = len(raw_output) 449 for v in list(metric_offsets.values()): 450 if v < end_offset and v > start_offset: 451 end_offset = v 452 453 return start_offset, end_offset 454 455 def rating_token_to_score( 456 self, token_logprob: ChatCompletionTokenLogprob 457 ) -> float | None: 458 """ 459 Convert a rating token to a score using weighted average of top logprobs. 460 461 Only includes tokens that have valid scores. 462 463 Some cleanup for upper case, whitespace and quotes. LLMs aren't always consistent. 464 """ 465 primary_token_score = self.score_from_token_string(token_logprob.token) 466 # check this is a real rating token, it could just be the ": ", "," or whitespace 467 if primary_token_score is None: 468 return None 469 470 total_score = 0.0 471 total_probability = 0.0 472 top_logprobs_contains_primary_token = False 473 474 # Process all valid scoring tokens from alternatives 475 for top_logprob in token_logprob.top_logprobs: 476 if top_logprob.token == token_logprob.token: 477 top_logprobs_contains_primary_token = True 478 token_score = self.score_from_token_string(top_logprob.token) 479 if token_score is not None: 480 # Convert logprob to probability 481 probability = math.exp(top_logprob.logprob) 482 total_score += token_score * probability 483 total_probability += probability 484 485 # Weird OpenAI 4o bug - sometimes the primary token is included in the top logprobs, sometimes not. 486 # Add the primary token back in if excluded 487 if not top_logprobs_contains_primary_token: 488 if token_logprob.logprob == -9999.0: 489 # Another "bug" - sometimes the logprob is -9999.0. This seems to happen when the rest of the logprobs are tiny probability. 
490 total_score += primary_token_score * 1.0 491 total_probability += 1.0 492 else: 493 probability = math.exp(token_logprob.logprob) 494 total_score += primary_token_score * probability 495 total_probability += probability 496 497 if total_probability <= 0.0: 498 raise RuntimeError( 499 f"No valid scoring tokens found for {token_logprob.token}. This should never happen as the token has a valid score (so it must be excluded from top logprobs). Please file a bug if you see this." 500 ) 501 502 # Normalize by total probability of valid tokens (LLM may have wanted to generate other non-rating tokens, these shouldn't lower score of rating tokens) 503 weighted_score = total_score / total_probability 504 505 return weighted_score 506 507 def score_from_token_string(self, token: str) -> float | None: 508 if token in TOKEN_TO_SCORE_MAP: 509 return TOKEN_TO_SCORE_MAP[token] 510 511 # handle more token variations like '"1"' and '"pass"' and ' paSS' and 'PASS' 512 unquoted_token = token.strip().strip('"').lower() 513 if unquoted_token in TOKEN_TO_SCORE_MAP: 514 return TOKEN_TO_SCORE_MAP[unquoted_token] 515 516 # handle numeric tokens like "1.0" 517 try: 518 float_value = float(token) 519 if float_value.is_integer(): 520 str_token = str(int(float_value)) 521 if str_token in TOKEN_TO_SCORE_MAP: 522 return TOKEN_TO_SCORE_MAP[str_token] 523 except ValueError: 524 pass 525 526 return None 527 528 def metric_offsets(self, raw_output: str, metrics: List[str]) -> Dict[str, int]: 529 """ 530 Find the offset to the start of each metric in the raw output json 531 532 For the example json: `{"overall_rating": 1}` == 1 533 534 should return: 535 { 536 "overall_rating": 1 # it's 1 character into the json string 537 } 538 """ 539 metric_offsets: Dict[str, int] = {} 540 for metric in metrics: 541 # the quoted metric name is expected in the json: `{"overall_rating": 1}` == 1 542 metric_name = f'"{metric}"' 543 544 # we expect it exactly once 545 count = raw_output.count(metric_name) 546 if 
count != 1: 547 raise ValueError( 548 f"Metric {metric} should appear exactly once in the output. Found {count} times" 549 ) 550 551 offset = raw_output.find(metric_name) 552 if offset == -1: 553 raise ValueError(f"Metric {metric} not found in raw output") 554 metric_offsets[metric] = offset 555 return metric_offsets
An evaluator which implements G-Eval and LLM as Judge.
G-Eval is a method of evaluating the quality of a model's output. It is a weighted average of the scores of the tokens in the output. The weights are the log probabilities of the tokens in the output. https://arxiv.org/abs/2303.16634
LLM as Judge is a method of evaluating the quality of a model's output. It simply asks the LLM to score, and uses the returned output (no logprobs needed). Also called direct evaluation.
@misc{liu2023gevalnlgevaluationusing, title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment}, author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu}, year={2023}, eprint={2303.16634}, archivePrefix={arXiv}, primaryClass={cs.CL}, url={https://arxiv.org/abs/2303.16634}, }
100 def __init__(self, eval_config: EvalConfig, run_config: RunConfigProperties | None): 101 if ( 102 eval_config.config_type != EvalConfigType.g_eval 103 and eval_config.config_type != EvalConfigType.llm_as_judge 104 ): 105 raise ValueError( 106 f"GEval must be initialized with a GEval or LLM as Judge config_type. Got {eval_config.config_type}" 107 ) 108 109 super().__init__(eval_config, run_config) 110 111 self.geval_task = GEvalTask(eval_config)
113 def generate_final_answer_run_description( 114 self, eval_input: str, eval_output: str 115 ) -> str: 116 return f"""The model was given the following input for the task: 117<eval_data> 118{eval_input} 119</eval_data> 120 121The model produced the following output for the task: 122<eval_data> 123{eval_output} 124</eval_data> 125"""
127 def generate_ref_ans_run_description( 128 self, eval_input: str, eval_output: str, reference_answer: str 129 ) -> str: 130 return f"""The model was given the following input for the task: 131<eval_data> 132{eval_input} 133</eval_data> 134 135The model produced the following output for the task: 136<eval_data> 137{eval_output} 138</eval_data> 139 140This is the reference answer: 141<eval_data> 142{reference_answer} 143</eval_data> 144"""
    def generate_full_trace_run_description(
        self,
        eval_input: str,
        available_tools: str | None,
        conversation_history: str,
    ) -> str:
        """
        Build the judge prompt body for full-trace evaluation.

        Includes the user input, any tool-use guidance configured on the eval
        (from the spec, or legacy template_properties), the available tool list
        (when known), and the full formatted conversation history — each
        wrapped in <eval_data> tags.

        Args:
            eval_input: the original user input given to the model.
            available_tools: formatted tool list; "" means the model had no
                tools, None means the tool list is unknown (section omitted).
            conversation_history: formatted conversation history of the run.

        Returns:
            The assembled description string.
        """
        description = ""
        description += f"""The model was given the following <user_input> for the <task_description>:
<eval_data>
<user_input>{eval_input}</user_input>
</eval_data>
"""
        # Get properties from spec if available, otherwise from eval.template_properties (for legacy evals)
        spec = self.eval.associated_spec(readonly=True)

        # Spec uses different keys than legacy eval template_properties
        if spec:
            # Spec: tool_use_guidelines, appropriate_tool_use_examples, inappropriate_tool_use_examples
            # `or ""` collapses missing/None properties to an empty string
            tool_use_guidelines = str(spec.properties.get("tool_use_guidelines") or "")
            appropriate_tool_use_examples = str(
                spec.properties.get("appropriate_tool_use_examples") or ""
            )
            inappropriate_tool_use_examples = str(
                spec.properties.get("inappropriate_tool_use_examples") or ""
            )
            description += f"""The model was given the following <tool_use_guidelines>:
<eval_data>
<tool_use_guidelines>
{tool_use_guidelines}
</tool_use_guidelines>
</eval_data>
"""
            description += f"""The model was given the following <appropriate_tool_use_examples>:
<eval_data>
<appropriate_tool_use_examples>
{appropriate_tool_use_examples}
</appropriate_tool_use_examples>
</eval_data>
"""
            description += f"""The model was given the following <inappropriate_tool_use_examples>:
<eval_data>
<inappropriate_tool_use_examples>
{inappropriate_tool_use_examples}
</inappropriate_tool_use_examples>
</eval_data>
"""
        elif self.eval.template_properties:
            # Legacy eval: appropriate_tool_use_guidelines, inappropriate_tool_use_guidelines
            appropriate_tool_use_guidelines = str(
                self.eval.template_properties.get("appropriate_tool_use_guidelines")
                or ""
            )
            inappropriate_tool_use_guidelines = str(
                self.eval.template_properties.get("inappropriate_tool_use_guidelines")
                or ""
            )

            description += f"""The model was given the following <appropriate_tool_use_guidelines> guidelines:
<eval_data>
<appropriate_tool_use_guidelines>
{appropriate_tool_use_guidelines}
</appropriate_tool_use_guidelines>
</eval_data>
"""
            # Only include if it has content since it is optional
            if inappropriate_tool_use_guidelines:
                description += f"""The model was given the following <inappropriate_tool_use_guidelines> guidelines:
<eval_data>
<inappropriate_tool_use_guidelines>
{inappropriate_tool_use_guidelines}
</inappropriate_tool_use_guidelines>
</eval_data>
"""

        # None = tool list unknown; "" = explicitly no tools; otherwise list them
        if available_tools is not None:
            if available_tools != "":
                description += f"""
This is the list of tools available to the model:
<eval_data>
<available_tools>{available_tools}</available_tools>
</eval_data>
"""
            else:
                description += """
There were no tools available to the model.
"""

        description += f"""
This is the full conversation history for the task run:
<eval_data>
<conversation_history>{conversation_history}</conversation_history>
</eval_data>
"""
        return description
    async def run_eval(
        self, task_run: TaskRun, eval_job_item: TaskRun | None = None
    ) -> tuple[EvalScores, Dict[str, str] | None]:
        """
        Run this eval on the given task run.

        Builds a run description matching the eval's data type (full trace /
        reference answer / final answer), invokes the judge model via a Kiln
        adapter, and converts the judge output to scores (logprob-weighted for
        G-Eval, direct for LLM-as-Judge).

        Args:
            task_run: the task run being evaluated.
            eval_job_item: reference item; required for reference-answer
                evaluation, unused otherwise.

        Returns:
            Tuple of (scores keyed by metric name, intermediate outputs such
            as chain-of-thought, if any).

        Raises:
            ValueError: when required data (trace, or reference item) is missing.
        """

        model_name, provider = self.model_and_provider()

        # Only fetch logprobs for G-Eval
        # There are at most 5 valid rating tokens per rating type (five_star being largest), so 10 is more than enough to get to the very very unlikely
        top_logprobs = (
            10 if self.eval_config.config_type == EvalConfigType.g_eval else None
        )

        # We don't expose setting this manually in the UI, so pull a recommended mode from ml_model_list
        structured_output_mode = default_structured_output_mode_for_model_provider(
            model_name,
            provider,
            default=StructuredOutputMode.json_schema,
            # G-eval expects JSON, so don't allow function calling modes
            disallowed_modes=[
                StructuredOutputMode.function_calling,
                StructuredOutputMode.function_calling_weak,
            ],
        )

        adapter = adapter_for_task(
            self.geval_task,
            run_config_properties=KilnAgentRunConfigProperties(
                model_name=model_name,
                model_provider_name=provider,
                # We always use Simple COT for G-Eval and LLM as Judge
                prompt_id=PromptGenerators.SIMPLE_CHAIN_OF_THOUGHT,
                structured_output_mode=structured_output_mode,
            ),
            base_adapter_config=AdapterConfig(
                # Don't save this run into the task_runs. It will be saved into an eval_run where it belongs
                allow_saving=False,
                top_logprobs=top_logprobs,
            ),
        )

        # Build the prompt body the judge sees, based on what data the eval scores
        if self.eval.evaluation_data_type == EvalDataType.full_trace:
            if task_run.trace is None:
                raise ValueError("Task run trace is required for full trace evaluation")

            available_tools = await EvalUtils.formatted_available_tools_from_task_run(
                task_run
            )
            run_description = self.generate_full_trace_run_description(
                task_run.input,
                available_tools,
                EvalTraceFormatter.trace_to_formatted_conversation_history(
                    task_run.trace
                ),
            )

        elif self.eval.evaluation_data_type == EvalDataType.reference_answer:
            if eval_job_item is None:
                raise ValueError(
                    "Eval job item is required for reference answer evaluation"
                )
            run_description = self.generate_ref_ans_run_description(
                task_run.input, task_run.output.output, eval_job_item.output.output
            )

        else:  # EvalDataType.final_answer
            run_description = self.generate_final_answer_run_description(
                task_run.input, task_run.output.output
            )

        # We don't need the run, but invoke_returning_run_output() runs validations for us over _run()
        _, run_output = await adapter.invoke_returning_run_output(run_description)

        if self.eval_config.config_type == EvalConfigType.llm_as_judge:
            return self.build_llm_as_judge_score(
                run_output
            ), run_output.intermediate_outputs
        else:
            return self.build_g_eval_score(run_output), run_output.intermediate_outputs
Run this eval on the given task run.
323 def build_llm_as_judge_score(self, run_output: RunOutput) -> EvalScores: 324 """ 325 Build the LLM as Judge score for the given run and run output. 326 """ 327 # Convert the output format we asked for (discreet values) to our float scores 328 scores: EvalScores = {} 329 if not isinstance(run_output.output, dict): 330 raise ValueError("LLM as Judge output must be a dictionary") 331 332 for metric, score in run_output.output.items(): 333 token_score = self.score_from_token_string(f"{score}") 334 if token_score is None: 335 raise ValueError( 336 f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema." 337 ) 338 scores[metric] = token_score 339 return scores
Build the LLM as Judge score for the given run and run output.
341 def build_g_eval_score(self, run_output: RunOutput) -> EvalScores: 342 """ 343 Build the G-Eval score for the given run and run output. 344 345 We create a weighted average of each rating using the logprobs. 346 347 @misc{liu2023gevalnlgevaluationusing, 348 title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment}, 349 author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu}, 350 year={2023}, 351 eprint={2303.16634}, 352 archivePrefix={arXiv}, 353 primaryClass={cs.CL}, 354 url={https://arxiv.org/abs/2303.16634}, 355 } 356 """ 357 # We use structured output 358 outputs = run_output.output 359 assert isinstance(outputs, dict) 360 361 # Build raw string output from the logprobs, which is easier to work with than Dict for the next bit 362 raw_output = self.raw_output_from_logprobs(run_output) 363 364 # find the offset the start of each metric in the raw output json 365 metrics: List[str] = list(outputs.keys()) 366 metric_offsets = self.metric_offsets(raw_output, metrics) 367 368 final_scores: EvalScores = {} 369 for metric in metrics: 370 score = self.g_eval_single_metric( 371 run_output, metric, metric_offsets, raw_output 372 ) 373 if score is None: 374 raise ValueError( 375 f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema." 376 ) 377 final_scores[metric] = score 378 379 return final_scores
Build the G-Eval score for the given run and run output.
We create a weighted average of each rating using the logprobs.
@misc{liu2023gevalnlgevaluationusing, title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment}, author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu}, year={2023}, eprint={2303.16634}, archivePrefix={arXiv}, primaryClass={cs.CL}, url={https://arxiv.org/abs/2303.16634}, }
381 def g_eval_single_metric( 382 self, 383 run_output: RunOutput, 384 metric: str, 385 metric_offsets: Dict[str, int], 386 raw_output: str, 387 ) -> float | None: 388 """ 389 Run the G-Eval for a single metric. 390 391 Scan the logprobs for the metric and return the weighted score of the rating token. 392 """ 393 394 start_offset, end_offset = self.token_search_range( 395 raw_output, metric, metric_offsets 396 ) 397 398 offset = 0 399 400 if ( 401 run_output.output_logprobs is None 402 or run_output.output_logprobs.content is None 403 ): 404 raise RuntimeError( 405 "No logprobs found for output - can not calculate g-eval" 406 ) 407 408 # scan the tokens in the range, looking for the rating token 409 for _, chat_logprob in enumerate(run_output.output_logprobs.content): 410 if offset >= end_offset: 411 break 412 if offset >= start_offset: 413 score = self.rating_token_to_score(chat_logprob) 414 if score is not None: 415 return score 416 offset += len(chat_logprob.token) 417 418 return None
Run the G-Eval for a single metric.
Scan the logprobs for the metric and return the weighted score of the rating token.
420 def raw_output_from_logprobs(self, run_output: RunOutput) -> str: 421 """ 422 Build the raw output string from the logprobs. Generate from logprobs so it's guaranteed to match the logprobs offsets 423 """ 424 if ( 425 run_output.output_logprobs is None 426 or run_output.output_logprobs.content is None 427 ): 428 raise RuntimeError( 429 "No logprobs found for output - can not calculate g-eval" 430 ) 431 432 raw = "" 433 for chat_logprob in run_output.output_logprobs.content: 434 raw += chat_logprob.token 435 return raw
Build the raw output string from the logprobs. Generate from logprobs so it's guaranteed to match the logprobs offsets
437 def token_search_range( 438 self, raw_output: str, metric: str, metric_offsets: Dict[str, int] 439 ) -> Tuple[int, int]: 440 """ 441 Find the start and end offsets of the metric in the raw output. 442 443 Start searching after the end of the target metric json entry ("overall_rating":), and before the start of the next metric ("some_other_score"). 444 """ 445 start_offset = metric_offsets[metric] + len(metric) 446 447 # Find the lowest end offset that is greater than the start offset 448 end_offset = len(raw_output) 449 for v in list(metric_offsets.values()): 450 if v < end_offset and v > start_offset: 451 end_offset = v 452 453 return start_offset, end_offset
Find the start and end offsets of the metric in the raw output.
Start searching after the end of the target metric json entry ("overall_rating":), and before the start of the next metric ("some_other_score").
455 def rating_token_to_score( 456 self, token_logprob: ChatCompletionTokenLogprob 457 ) -> float | None: 458 """ 459 Convert a rating token to a score using weighted average of top logprobs. 460 461 Only includes tokens that have valid scores. 462 463 Some cleanup for upper case, whitespace and quotes. LLMs aren't always consistent. 464 """ 465 primary_token_score = self.score_from_token_string(token_logprob.token) 466 # check this is a real rating token, it could just be the ": ", "," or whitespace 467 if primary_token_score is None: 468 return None 469 470 total_score = 0.0 471 total_probability = 0.0 472 top_logprobs_contains_primary_token = False 473 474 # Process all valid scoring tokens from alternatives 475 for top_logprob in token_logprob.top_logprobs: 476 if top_logprob.token == token_logprob.token: 477 top_logprobs_contains_primary_token = True 478 token_score = self.score_from_token_string(top_logprob.token) 479 if token_score is not None: 480 # Convert logprob to probability 481 probability = math.exp(top_logprob.logprob) 482 total_score += token_score * probability 483 total_probability += probability 484 485 # Weird OpenAI 4o bug - sometimes the primary token is included in the top logprobs, sometimes not. 486 # Add the primary token back in if excluded 487 if not top_logprobs_contains_primary_token: 488 if token_logprob.logprob == -9999.0: 489 # Another "bug" - sometimes the logprob is -9999.0. This seems to happen when the rest of the logprobs are tiny probability. 490 total_score += primary_token_score * 1.0 491 total_probability += 1.0 492 else: 493 probability = math.exp(token_logprob.logprob) 494 total_score += primary_token_score * probability 495 total_probability += probability 496 497 if total_probability <= 0.0: 498 raise RuntimeError( 499 f"No valid scoring tokens found for {token_logprob.token}. This should never happen as the token has a valid score (so it must be excluded from top logprobs). Please file a bug if you see this." 
500 ) 501 502 # Normalize by total probability of valid tokens (LLM may have wanted to generate other non-rating tokens, these shouldn't lower score of rating tokens) 503 weighted_score = total_score / total_probability 504 505 return weighted_score
Convert a rating token to a score using weighted average of top logprobs.
Only includes tokens that have valid scores.
Some cleanup for upper case, whitespace and quotes. LLMs aren't always consistent.
507 def score_from_token_string(self, token: str) -> float | None: 508 if token in TOKEN_TO_SCORE_MAP: 509 return TOKEN_TO_SCORE_MAP[token] 510 511 # handle more token variations like '"1"' and '"pass"' and ' paSS' and 'PASS' 512 unquoted_token = token.strip().strip('"').lower() 513 if unquoted_token in TOKEN_TO_SCORE_MAP: 514 return TOKEN_TO_SCORE_MAP[unquoted_token] 515 516 # handle numeric tokens like "1.0" 517 try: 518 float_value = float(token) 519 if float_value.is_integer(): 520 str_token = str(int(float_value)) 521 if str_token in TOKEN_TO_SCORE_MAP: 522 return TOKEN_TO_SCORE_MAP[str_token] 523 except ValueError: 524 pass 525 526 return None
528 def metric_offsets(self, raw_output: str, metrics: List[str]) -> Dict[str, int]: 529 """ 530 Find the offset to the start of each metric in the raw output json 531 532 For the example json: `{"overall_rating": 1}` == 1 533 534 should return: 535 { 536 "overall_rating": 1 # it's 1 character into the json string 537 } 538 """ 539 metric_offsets: Dict[str, int] = {} 540 for metric in metrics: 541 # the quoted metric name is expected in the json: `{"overall_rating": 1}` == 1 542 metric_name = f'"{metric}"' 543 544 # we expect it exactly once 545 count = raw_output.count(metric_name) 546 if count != 1: 547 raise ValueError( 548 f"Metric {metric} should appear exactly once in the output. Found {count} times" 549 ) 550 551 offset = raw_output.find(metric_name) 552 if offset == -1: 553 raise ValueError(f"Metric {metric} not found in raw output") 554 metric_offsets[metric] = offset 555 return metric_offsets
Find the offset to the start of each metric in the raw output json
For the example json: {"overall_rating": 1} == 1
should return: { "overall_rating": 1 # it's 1 character into the json string }