kiln_ai.adapters.eval.g_eval
import math
from typing import Dict, List, Tuple

from litellm.types.utils import ChatCompletionTokenLogprob

from kiln_ai.adapters.adapter_registry import adapter_for_task
from kiln_ai.adapters.eval.base_eval import BaseEval
from kiln_ai.adapters.eval.eval_utils.eval_trace_formatter import EvalTraceFormatter
from kiln_ai.adapters.eval.eval_utils.eval_utils import EvalUtils
from kiln_ai.adapters.ml_model_list import (
    default_structured_output_mode_for_model_provider,
)
from kiln_ai.adapters.model_adapters.base_adapter import (
    AdapterConfig,
    RunOutput,
    SkillsDict,
)
from kiln_ai.adapters.prompt_builders import PromptGenerators
from kiln_ai.datamodel import Project, Task, TaskRun
from kiln_ai.datamodel.eval import EvalConfig, EvalConfigType, EvalDataType, EvalScores
from kiln_ai.datamodel.run_config import KilnAgentRunConfigProperties
from kiln_ai.datamodel.task import RunConfigProperties, StructuredOutputMode

# all the tokens we score for, and their float scores.
TOKEN_TO_SCORE_MAP: Dict[str, float] = {
    "1": 1.0,
    "2": 2.0,
    "3": 3.0,
    "4": 4.0,
    "5": 5.0,
    "pass": 1.0,
    "fail": 0.0,
    "critical": -1.0,
}
class GEvalTask(Task, parent_of={}):
    """
    Kiln task for executing a G-Eval. Can be run on any Kiln adapter which supports logprobs.

    Note G-Eval implements both G-Eval and LLM as Judge as they are very similar.
    """

    def __init__(self, eval_config: EvalConfig):
        tmp_project = Project(name="GEval")

        # Build a simple LLM as Judge system instruction
        system_instruction = "Your job to evaluate a model's performance on a task. Blocks will be marked with <eval_data> tags.\n"
        # Optionally add a short task description
        task_description = eval_config.properties.get("task_description", None)
        if task_description:
            system_instruction += f"\nThe task the model was given is as follows:\n<eval_data>\n<task_description>{task_description}</task_description>\n</eval_data>\n"

        # Build the COT eval instructions
        steps = eval_config.properties.get("eval_steps", [])
        if not isinstance(steps, list):
            raise ValueError("eval_steps must be a list.")
        if len(steps) == 1:
            cot_instructions = "First, think step by step about the model's performance following this evaluation step:\n\n"
            cot_instructions += f"{steps[0]}\n"
        else:
            cot_instructions = "First, think step by step about the model's performance following these evaluation steps:\n\n"
            for i, step in enumerate(steps):
                cot_instructions += f"{i + 1}) {step}\n"

        eval = eval_config.parent_eval()
        if not eval:
            raise ValueError("Eval config must have a parent eval")

        # Build the output schema from the eval's target output scores.
        # We restrict the LLM's output scoring schema to discrete scores (pass/fail/critical/1-5) - allow_float_scores=False
        # However, the final scores from the evaluator can be a float (see later logprob calculation, which requires discrete token outputs)
        output_schema = BaseEval.build_score_schema(eval, allow_float_scores=False)

        super().__init__(
            name="GEval Task",
            parent=tmp_project,
            instruction=system_instruction,
            thinking_instruction=cot_instructions,
            output_json_schema=output_schema,
        )
Kiln task for executing a G-Eval. Can be run on any Kiln adapter which supports logprobs.
Note: G-Eval implements both G-Eval and LLM as Judge, as they are very similar.
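As a hedged illustration of the chain-of-thought instructions GEvalTask builds from the eval config's "eval_steps" property (the step text below is made up, not from this module):

steps = ["Check the output is factually accurate.", "Check the tone matches the task."]

# Mirrors the multi-step branch of GEvalTask.__init__ above
cot_instructions = "First, think step by step about the model's performance following these evaluation steps:\n\n"
for i, step in enumerate(steps):
    cot_instructions += f"{i + 1}) {step}\n"
print(cot_instructions)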
class GEval(BaseEval):
    """
    A evaluator which implements G-Eval and LLM as Judge.

    G-Eval is a method of evaluating the quality of a model's output. It is a weighted average of the scores of the tokens in the output. The weights are the log probabilities of the tokens in the output. https://arxiv.org/abs/2303.16634

    LLM as Judge is a method of evaluating the quality of a model's output. It simply asks the LLM to score, and uses the returned output (no logprobs needed). Also called direct evaluation.

    @misc{liu2023gevalnlgevaluationusing,
        title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
        author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
        year={2023},
        eprint={2303.16634},
        archivePrefix={arXiv},
        primaryClass={cs.CL},
        url={https://arxiv.org/abs/2303.16634},
    }
    """
An evaluator which implements G-Eval and LLM as Judge.
G-Eval is a method of evaluating the quality of a model's output. Each metric's score is a weighted average of the candidate rating tokens, where the weights are the token probabilities derived from the log probabilities returned by the model. https://arxiv.org/abs/2303.16634
LLM as Judge is a method of evaluating the quality of a model's output. It simply asks the LLM to score, and uses the returned output (no logprobs needed). Also called direct evaluation.
@misc{liu2023gevalnlgevaluationusing,
    title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
    author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
    year={2023},
    eprint={2303.16634},
    archivePrefix={arXiv},
    primaryClass={cs.CL},
    url={https://arxiv.org/abs/2303.16634},
}
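A minimal numeric sketch of the G-Eval weighting described above, with made-up logprob values (this is an illustration only, not code from this module):

import math

# Hypothetical top-logprob alternatives at the rating token position:
# the judge leaned towards "4" but gave "5" and "3" some probability mass.
candidate_logprobs = {"4": -0.30, "5": -1.70, "3": -2.50}
token_scores = {"1": 1.0, "2": 2.0, "3": 3.0, "4": 4.0, "5": 5.0}

total_score = 0.0
total_probability = 0.0
for token, logprob in candidate_logprobs.items():
    probability = math.exp(logprob)  # convert the logprob back to a probability
    total_score += token_scores[token] * probability
    total_probability += probability

# Weighted average over rating tokens only, so the result can be fractional.
print(round(total_score / total_probability, 2))  # ≈ 4.1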
    def __init__(
        self,
        eval_config: EvalConfig,
        run_config: RunConfigProperties | None,
        skills: SkillsDict | None = None,
    ):
        if (
            eval_config.config_type != EvalConfigType.g_eval
            and eval_config.config_type != EvalConfigType.llm_as_judge
        ):
            raise ValueError(
                f"GEval must be initialized with a GEval or LLM as Judge config_type. Got {eval_config.config_type}"
            )

        super().__init__(eval_config, run_config, skills=skills)

        self.geval_task = GEvalTask(eval_config)
    def generate_final_answer_run_description(
        self, eval_input: str, eval_output: str
    ) -> str:
        return f"""The model was given the following input for the task:
<eval_data>
{eval_input}
</eval_data>

The model produced the following output for the task:
<eval_data>
{eval_output}
</eval_data>
"""
    def generate_ref_ans_run_description(
        self, eval_input: str, eval_output: str, reference_answer: str
    ) -> str:
        return f"""The model was given the following input for the task:
<eval_data>
{eval_input}
</eval_data>

The model produced the following output for the task:
<eval_data>
{eval_output}
</eval_data>

This is the reference answer:
<eval_data>
{reference_answer}
</eval_data>
"""
    def generate_full_trace_run_description(
        self,
        eval_input: str,
        available_tools: str | None,
        conversation_history: str,
    ) -> str:
        description = ""
        description += f"""The model was given the following <user_input> for the <task_description>:
<eval_data>
<user_input>{eval_input}</user_input>
</eval_data>
"""
        # Get properties from spec if available, otherwise from eval.template_properties (for legacy evals)
        spec = self.eval.associated_spec(readonly=True)

        # Spec uses different keys than legacy eval template_properties
        if spec:
            # Spec: tool_use_guidelines, appropriate_tool_use_examples, inappropriate_tool_use_examples
            tool_use_guidelines = str(spec.properties.get("tool_use_guidelines") or "")
            appropriate_tool_use_examples = str(
                spec.properties.get("appropriate_tool_use_examples") or ""
            )
            inappropriate_tool_use_examples = str(
                spec.properties.get("inappropriate_tool_use_examples") or ""
            )
            description += f"""The model was given the following <tool_use_guidelines>:
<eval_data>
<tool_use_guidelines>
{tool_use_guidelines}
</tool_use_guidelines>
</eval_data>
"""
            description += f"""The model was given the following <appropriate_tool_use_examples>:
<eval_data>
<appropriate_tool_use_examples>
{appropriate_tool_use_examples}
</appropriate_tool_use_examples>
</eval_data>
"""
            description += f"""The model was given the following <inappropriate_tool_use_examples>:
<eval_data>
<inappropriate_tool_use_examples>
{inappropriate_tool_use_examples}
</inappropriate_tool_use_examples>
</eval_data>
"""
        elif self.eval.template_properties:
            # Legacy eval: appropriate_tool_use_guidelines, inappropriate_tool_use_guidelines
            appropriate_tool_use_guidelines = str(
                self.eval.template_properties.get("appropriate_tool_use_guidelines")
                or ""
            )
            inappropriate_tool_use_guidelines = str(
                self.eval.template_properties.get("inappropriate_tool_use_guidelines")
                or ""
            )

            description += f"""The model was given the following <appropriate_tool_use_guidelines> guidelines:
<eval_data>
<appropriate_tool_use_guidelines>
{appropriate_tool_use_guidelines}
</appropriate_tool_use_guidelines>
</eval_data>
"""
            # Only include if it has content since it is optional
            if inappropriate_tool_use_guidelines:
                description += f"""The model was given the following <inappropriate_tool_use_guidelines> guidelines:
<eval_data>
<inappropriate_tool_use_guidelines>
{inappropriate_tool_use_guidelines}
</inappropriate_tool_use_guidelines>
</eval_data>
"""

        if available_tools is not None:
            if available_tools != "":
                description += f"""
This is the list of tools available to the model:
<eval_data>
<available_tools>{available_tools}</available_tools>
</eval_data>
"""
            else:
                description += """
There were no tools available to the model.
"""

        description += f"""
This is the full conversation history for the task run:
<eval_data>
<conversation_history>{conversation_history}</conversation_history>
</eval_data>
"""
        return description
    async def run_eval(
        self, task_run: TaskRun, eval_job_item: TaskRun | None = None
    ) -> tuple[EvalScores, Dict[str, str] | None]:
        """
        Run this eval on the given task run.
        """

        model_name, provider = self.model_and_provider()

        # Only fetch logprobs for G-Eval
        # There are at most 5 valid rating tokens per rating type (five_star being largest), so 10 is more than enough to get to the very very unlikely
        top_logprobs = (
            10 if self.eval_config.config_type == EvalConfigType.g_eval else None
        )

        # We don't expose setting this manually in the UI, so pull a recommended mode from ml_model_list
        structured_output_mode = default_structured_output_mode_for_model_provider(
            model_name,
            provider,
            default=StructuredOutputMode.json_schema,
            # G-eval expects JSON, so don't allow function calling modes
            disallowed_modes=[
                StructuredOutputMode.function_calling,
                StructuredOutputMode.function_calling_weak,
            ],
        )

        adapter = adapter_for_task(
            self.geval_task,
            run_config_properties=KilnAgentRunConfigProperties(
                model_name=model_name,
                model_provider_name=provider,
                # We always use Simple COT for G-Eval and LLM as Judge
                prompt_id=PromptGenerators.SIMPLE_CHAIN_OF_THOUGHT,
                structured_output_mode=structured_output_mode,
            ),
            base_adapter_config=AdapterConfig(
                # Don't save this run into the task_runs. It will be saved into an eval_run where it belongs
                allow_saving=False,
                top_logprobs=top_logprobs,
            ),
        )

        if self.eval.evaluation_data_type == EvalDataType.full_trace:
            if task_run.trace is None:
                raise ValueError("Task run trace is required for full trace evaluation")

            available_tools = await EvalUtils.formatted_available_tools_from_task_run(
                task_run
            )
            run_description = self.generate_full_trace_run_description(
                task_run.input,
                available_tools,
                EvalTraceFormatter.trace_to_formatted_conversation_history(
                    task_run.trace
                ),
            )

        elif self.eval.evaluation_data_type == EvalDataType.reference_answer:
            if eval_job_item is None:
                raise ValueError(
                    "Eval job item is required for reference answer evaluation"
                )
            run_description = self.generate_ref_ans_run_description(
                task_run.input, task_run.output.output, eval_job_item.output.output
            )

        else:  # EvalDataType.final_answer
            run_description = self.generate_final_answer_run_description(
                task_run.input, task_run.output.output
            )

        # We don't need the run, but invoke_returning_run_output() runs validations for us over _run()
        _, run_output = await adapter.invoke_returning_run_output(run_description)

        if self.eval_config.config_type == EvalConfigType.llm_as_judge:
            return self.build_llm_as_judge_score(
                run_output
            ), run_output.intermediate_outputs
        else:
            return self.build_g_eval_score(run_output), run_output.intermediate_outputs
Run this eval on the given task run.
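A minimal usage sketch for GEval.run_eval. It assumes you already have an EvalConfig (saved under an Eval), RunConfigProperties, and a TaskRun to judge; the argument names below are placeholders, not APIs from this module.

import asyncio

from kiln_ai.adapters.eval.g_eval import GEval
from kiln_ai.datamodel import TaskRun
from kiln_ai.datamodel.eval import EvalConfig
from kiln_ai.datamodel.task import RunConfigProperties

async def judge_one_run(
    eval_config: EvalConfig,
    run_config: RunConfigProperties,
    task_run: TaskRun,
) -> None:
    evaluator = GEval(eval_config, run_config)
    scores, intermediate_outputs = await evaluator.run_eval(task_run)
    # e.g. {"overall_rating": 4.1} for G-Eval, or {"overall_rating": 4.0} for LLM as Judge
    print(scores)

# asyncio.run(judge_one_run(my_eval_config, my_run_config, my_task_run))  # placeholders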
    def build_llm_as_judge_score(self, run_output: RunOutput) -> EvalScores:
        """
        Build the LLM as Judge score for the given run and run output.
        """
        # Convert the output format we asked for (discreet values) to our float scores
        scores: EvalScores = {}
        if not isinstance(run_output.output, dict):
            raise ValueError("LLM as Judge output must be a dictionary")

        for metric, score in run_output.output.items():
            token_score = self.score_from_token_string(f"{score}")
            if token_score is None:
                raise ValueError(
                    f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema."
                )
            scores[metric] = token_score
        return scores
Build the LLM as Judge score for the given run and run output.
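A hedged, standalone sketch of the conversion build_llm_as_judge_score performs: the judge returns discrete labels, and each label is mapped to a float score (the judge output below is illustrative; the score map mirrors TOKEN_TO_SCORE_MAP at the top of this module).

judge_output = {"overall_rating": 4, "meets_requirements": "pass"}  # hypothetical judge output

token_to_score = {"1": 1.0, "2": 2.0, "3": 3.0, "4": 4.0, "5": 5.0, "pass": 1.0, "fail": 0.0, "critical": -1.0}

scores = {
    metric: token_to_score[f"{value}".strip().strip('"').lower()]
    for metric, value in judge_output.items()
}
print(scores)  # {'overall_rating': 4.0, 'meets_requirements': 1.0}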
    def build_g_eval_score(self, run_output: RunOutput) -> EvalScores:
        """
        Build the G-Eval score for the given run and run output.

        We create a weighted average of each rating using the logprobs.

        @misc{liu2023gevalnlgevaluationusing,
            title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
            author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
            year={2023},
            eprint={2303.16634},
            archivePrefix={arXiv},
            primaryClass={cs.CL},
            url={https://arxiv.org/abs/2303.16634},
        }
        """
        # We use structured output
        outputs = run_output.output
        assert isinstance(outputs, dict)

        # Build raw string output from the logprobs, which is easier to work with than Dict for the next bit
        raw_output = self.raw_output_from_logprobs(run_output)

        # find the offset the start of each metric in the raw output json
        metrics: List[str] = list(outputs.keys())
        metric_offsets = self.metric_offsets(raw_output, metrics)

        final_scores: EvalScores = {}
        for metric in metrics:
            score = self.g_eval_single_metric(
                run_output, metric, metric_offsets, raw_output
            )
            if score is None:
                raise ValueError(
                    f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema."
                )
            final_scores[metric] = score

        return final_scores
Build the G-Eval score for the given run and run output.
We create a weighted average of each rating using the logprobs.
@misc{liu2023gevalnlgevaluationusing,
    title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
    author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
    year={2023},
    eprint={2303.16634},
    archivePrefix={arXiv},
    primaryClass={cs.CL},
    url={https://arxiv.org/abs/2303.16634},
}
    def g_eval_single_metric(
        self,
        run_output: RunOutput,
        metric: str,
        metric_offsets: Dict[str, int],
        raw_output: str,
    ) -> float | None:
        """
        Run the G-Eval for a single metric.

        Scan the logprobs for the metric and return the weighted score of the rating token.
        """

        start_offset, end_offset = self.token_search_range(
            raw_output, metric, metric_offsets
        )

        offset = 0

        if (
            run_output.output_logprobs is None
            or run_output.output_logprobs.content is None
        ):
            raise RuntimeError(
                "No logprobs found for output - can not calculate g-eval"
            )

        # scan the tokens in the range, looking for the rating token
        for _, chat_logprob in enumerate(run_output.output_logprobs.content):
            if offset >= end_offset:
                break
            if offset >= start_offset:
                score = self.rating_token_to_score(chat_logprob)
                if score is not None:
                    return score
            offset += len(chat_logprob.token)

        return None
Run the G-Eval for a single metric.
Scan the logprobs for the metric and return the weighted score of the rating token.
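A standalone sketch of the offset scan g_eval_single_metric performs, using a made-up token list. It is simplified: the real method converts the matched token to a weighted score via rating_token_to_score rather than just reporting it.

# Concatenating these tokens rebuilds the raw JSON '{"overall_rating": 4}'
tokens = ['{"', 'overall_rating', '":', ' ', '4', '}']
rating_tokens = {"1", "2", "3", "4", "5", "pass", "fail", "critical"}

# The range token_search_range would compute for this string and metric
start_offset, end_offset = 15, 21

offset = 0
for token in tokens:
    if offset >= end_offset:
        break
    if offset >= start_offset and token.strip().strip('"').lower() in rating_tokens:
        print(f"rating token {token!r} found at offset {offset}")  # rating token '4' found at offset 19
        break
    offset += len(token)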
    def raw_output_from_logprobs(self, run_output: RunOutput) -> str:
        """
        Build the raw output string from the logprobs. Generate from logprobs so it's guaranteed to match the logprobs offsets
        """
        if (
            run_output.output_logprobs is None
            or run_output.output_logprobs.content is None
        ):
            raise RuntimeError(
                "No logprobs found for output - can not calculate g-eval"
            )

        raw = ""
        for chat_logprob in run_output.output_logprobs.content:
            raw += chat_logprob.token
        return raw
Build the raw output string from the logprobs. Generating it from the logprobs guarantees the string matches the logprob token offsets.
    def token_search_range(
        self, raw_output: str, metric: str, metric_offsets: Dict[str, int]
    ) -> Tuple[int, int]:
        """
        Find the start and end offsets of the metric in the raw output.

        Start searching after the end of the target metric json entry ("overall_rating":), and before the start of the next metric ("some_other_score").
        """
        start_offset = metric_offsets[metric] + len(metric)

        # Find the lowest end offset that is greater than the start offset
        end_offset = len(raw_output)
        for v in list(metric_offsets.values()):
            if v < end_offset and v > start_offset:
                end_offset = v

        return start_offset, end_offset
Find the start and end offsets of the metric in the raw output.
Start searching after the end of the target metric json entry ("overall_rating":), and before the start of the next metric ("some_other_score").
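A standalone illustration of the search range token_search_range computes, on an example output string (the JSON and metric names are made up for this sketch):

raw_output = '{"overall_rating": 4, "style_rating": 5}'
metric_offsets = {
    "overall_rating": raw_output.find('"overall_rating"'),  # 1
    "style_rating": raw_output.find('"style_rating"'),      # 22
}

# For "overall_rating": start after the metric name, stop at the next metric's offset.
# (The real method falls back to the end of the string if there is no later metric.)
start = metric_offsets["overall_rating"] + len("overall_rating")  # 15
end = min(v for v in metric_offsets.values() if v > start)        # 22
print(repr(raw_output[start:end]))  # 'g": 4, ' — the slice containing the rating token "4"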
    def rating_token_to_score(
        self, token_logprob: ChatCompletionTokenLogprob
    ) -> float | None:
        """
        Convert a rating token to a score using weighted average of top logprobs.

        Only includes tokens that have valid scores.

        Some cleanup for upper case, whitespace and quotes. LLMs aren't always consistent.
        """
        primary_token_score = self.score_from_token_string(token_logprob.token)
        # check this is a real rating token, it could just be the ": ", "," or whitespace
        if primary_token_score is None:
            return None

        total_score = 0.0
        total_probability = 0.0
        top_logprobs_contains_primary_token = False

        # Process all valid scoring tokens from alternatives
        for top_logprob in token_logprob.top_logprobs:
            if top_logprob.token == token_logprob.token:
                top_logprobs_contains_primary_token = True
            token_score = self.score_from_token_string(top_logprob.token)
            if token_score is not None:
                # Convert logprob to probability
                probability = math.exp(top_logprob.logprob)
                total_score += token_score * probability
                total_probability += probability

        # Weird OpenAI 4o bug - sometimes the primary token is included in the top logprobs, sometimes not.
        # Add the primary token back in if excluded
        if not top_logprobs_contains_primary_token:
            if token_logprob.logprob == -9999.0:
                # Another "bug" - sometimes the logprob is -9999.0. This seems to happen when the rest of the logprobs are tiny probability.
                total_score += primary_token_score * 1.0
                total_probability += 1.0
            else:
                probability = math.exp(token_logprob.logprob)
                total_score += primary_token_score * probability
                total_probability += probability

        if total_probability <= 0.0:
            raise RuntimeError(
                f"No valid scoring tokens found for {token_logprob.token}. This should never happen as the token has a valid score (so it must be excluded from top logprobs). Please file a bug if you see this."
            )

        # Normalize by total probability of valid tokens (LLM may have wanted to generate other non-rating tokens, these shouldn't lower score of rating tokens)
        weighted_score = total_score / total_probability

        return weighted_score
Convert a rating token to a score using weighted average of top logprobs.
Only includes tokens that have valid scores.
Some cleanup for upper case, whitespace and quotes. LLMs aren't always consistent.
    def score_from_token_string(self, token: str) -> float | None:
        if token in TOKEN_TO_SCORE_MAP:
            return TOKEN_TO_SCORE_MAP[token]

        # handle more token variations like '"1"' and '"pass"' and ' paSS' and 'PASS'
        unquoted_token = token.strip().strip('"').lower()
        if unquoted_token in TOKEN_TO_SCORE_MAP:
            return TOKEN_TO_SCORE_MAP[unquoted_token]

        # handle numeric tokens like "1.0"
        try:
            float_value = float(token)
            if float_value.is_integer():
                str_token = str(int(float_value))
                if str_token in TOKEN_TO_SCORE_MAP:
                    return TOKEN_TO_SCORE_MAP[str_token]
        except ValueError:
            pass

        return None
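Hedged, standalone examples of the normalization score_from_token_string performs (the logic is re-implemented inline here, in slightly simplified form, so the snippet runs on its own):

token_to_score = {"1": 1.0, "2": 2.0, "3": 3.0, "4": 4.0, "5": 5.0, "pass": 1.0, "fail": 0.0, "critical": -1.0}

def normalize(token: str) -> float | None:
    # strip whitespace and quotes, lowercase, then fall back to integer-valued floats
    cleaned = token.strip().strip('"').lower()
    if cleaned in token_to_score:
        return token_to_score[cleaned]
    try:
        value = float(token)
        if value.is_integer():
            return token_to_score.get(str(int(value)))
    except ValueError:
        return None
    return None

print(normalize(' "PASS" '), normalize("4.0"), normalize(","))  # 1.0 4.0 None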
    def metric_offsets(self, raw_output: str, metrics: List[str]) -> Dict[str, int]:
        """
        Find the offset to the start of each metric in the raw output json

        For the example json: `{"overall_rating": 1}` == 1

        should return:
        {
            "overall_rating": 1 # it's 1 character into the json string
        }
        """
        metric_offsets: Dict[str, int] = {}
        for metric in metrics:
            # the quoted metric name is expected in the json: `{"overall_rating": 1}` == 1
            metric_name = f'"{metric}"'

            # we expect it exactly once
            count = raw_output.count(metric_name)
            if count != 1:
                raise ValueError(
                    f"Metric {metric} should appear exactly once in the output. Found {count} times"
                )

            offset = raw_output.find(metric_name)
            if offset == -1:
                raise ValueError(f"Metric {metric} not found in raw output")
            metric_offsets[metric] = offset
        return metric_offsets
Find the offset to the start of each metric in the raw output JSON.

For the example JSON {"overall_rating": 1}, this should return {"overall_rating": 1}: the quoted metric name starts 1 character into the JSON string.
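A standalone illustration of metric_offsets on an example JSON string (the JSON and metric names are made up for this sketch):

raw_output = '{"overall_rating": 4, "style_rating": 5}'
metrics = ["overall_rating", "style_rating"]

# Each offset points at the opening quote of the metric name in the raw JSON
metric_offsets = {m: raw_output.find(f'"{m}"') for m in metrics}
print(metric_offsets)  # {'overall_rating': 1, 'style_rating': 22}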