kiln_ai.adapters.eval.g_eval
import math
from typing import Dict, List, Tuple

from litellm.types.utils import ChatCompletionTokenLogprob

from kiln_ai.adapters.adapter_registry import adapter_for_task
from kiln_ai.adapters.eval.base_eval import BaseEval
from kiln_ai.adapters.eval.eval_utils.eval_trace_formatter import EvalTraceFormatter
from kiln_ai.adapters.eval.eval_utils.eval_utils import EvalUtils
from kiln_ai.adapters.ml_model_list import (
    default_structured_output_mode_for_model_provider,
)
from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig, RunOutput
from kiln_ai.adapters.prompt_builders import PromptGenerators
from kiln_ai.datamodel import Project, Task, TaskRun
from kiln_ai.datamodel.eval import EvalConfig, EvalConfigType, EvalDataType, EvalScores
from kiln_ai.datamodel.task import RunConfigProperties, StructuredOutputMode

# all the tokens we score for, and their float scores.
TOKEN_TO_SCORE_MAP: Dict[str, float] = {
    "1": 1.0,
    "2": 2.0,
    "3": 3.0,
    "4": 4.0,
    "5": 5.0,
    "pass": 1.0,
    "fail": 0.0,
    "critical": -1.0,
}
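The map covers the two rating scales used by these evals: 1-5 star ratings and pass/fail/critical labels. A few illustrative lookups, only for orientation (not part of the module):

assert TOKEN_TO_SCORE_MAP["5"] == 5.0
assert TOKEN_TO_SCORE_MAP["pass"] == 1.0
assert TOKEN_TO_SCORE_MAP["critical"] == -1.0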
class GEvalTask(Task, parent_of={}):
    """
    Kiln task for executing a G-Eval. Can be run on any Kiln adapter which supports logprobs.

    Note G-Eval implements both G-Eval and LLM as Judge as they are very similar.
    """
Kiln task for executing a G-Eval. Can be run on any Kiln adapter which supports logprobs.
Note G-Eval implements both G-Eval and LLM as Judge as they are very similar.
    def __init__(self, eval_config: EvalConfig):
        tmp_project = Project(name="GEval")

        # Build a simple LLM as Judge system instruction
        system_instruction = "Your job is to evaluate a model's performance on a task. Blocks will be marked with <eval_data> tags.\n"
        # Optionally add a short task description
        task_description = eval_config.properties.get("task_description", None)
        if task_description:
            system_instruction += f"\nThe task the model was given is as follows:\n<eval_data>\n<task_description>{task_description}</task_description>\n</eval_data>\n"

        # Build the COT eval instructions
        steps = eval_config.properties.get("eval_steps", [])
        if not isinstance(steps, list):
            raise ValueError("eval_steps must be a list.")
        if len(steps) == 1:
            cot_instructions = "First, think step by step about the model's performance following this evaluation step:\n\n"
            cot_instructions += f"{steps[0]}\n"
        else:
            cot_instructions = "First, think step by step about the model's performance following these evaluation steps:\n\n"
            for i, step in enumerate(steps):
                cot_instructions += f"{i + 1}) {step}\n"

        eval = eval_config.parent_eval()
        if not eval:
            raise ValueError("Eval config must have a parent eval")

        # Build the output schema from the eval's target output scores.
        # We restrict the LLM's output scoring schema to discrete scores (pass/fail/critical/1-5) - allow_float_scores=False
        # However, the final scores from the evaluator can be a float (see the later logprob calculation, which requires discrete token outputs)
        output_schema = BaseEval.build_score_schema(eval, allow_float_scores=False)

        super().__init__(
            name="GEval Task",
            parent=tmp_project,
            instruction=system_instruction,
            thinking_instruction=cot_instructions,
            output_json_schema=output_schema,
        )
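For orientation, here is a small sketch of the chain-of-thought instructions this constructor assembles, using a hypothetical two-step rubric (the steps are invented for illustration):

steps = ["Check the output for factual errors.", "Check that the tone matches the task."]
cot = "First, think step by step about the model's performance following these evaluation steps:\n\n"
for i, step in enumerate(steps):
    cot += f"{i + 1}) {step}\n"
# cot is now a numbered rubric:
# 1) Check the output for factual errors.
# 2) Check that the tone matches the task.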
class GEval(BaseEval):
    """
    An evaluator which implements G-Eval and LLM as Judge.

    G-Eval is a method of evaluating the quality of a model's output. The score is a weighted average over the candidate rating tokens, where the weights are the token probabilities derived from their log probabilities. https://arxiv.org/abs/2303.16634

    LLM as Judge is a method of evaluating the quality of a model's output. It simply asks the LLM to score, and uses the returned output (no logprobs needed). Also called direct evaluation.

    @misc{liu2023gevalnlgevaluationusing,
        title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
        author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
        year={2023},
        eprint={2303.16634},
        archivePrefix={arXiv},
        primaryClass={cs.CL},
        url={https://arxiv.org/abs/2303.16634},
    }
    """
An evaluator which implements G-Eval and LLM as Judge.
G-Eval is a method of evaluating the quality of a model's output. The score is a weighted average over the candidate rating tokens, where the weights are the token probabilities derived from their log probabilities. https://arxiv.org/abs/2303.16634
LLM as Judge is a method of evaluating the quality of a model's output. It simply asks the LLM to score, and uses the returned output (no logprobs needed). Also called direct evaluation.
@misc{liu2023gevalnlgevaluationusing,
    title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
    author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
    year={2023},
    eprint={2303.16634},
    archivePrefix={arXiv},
    primaryClass={cs.CL},
    url={https://arxiv.org/abs/2303.16634},
}
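A minimal sketch of the G-Eval weighting with toy numbers (not real model output): if the rating-token position puts 60% of its probability on "4", 30% on "5" and 10% on "3", the weighted score is 4.2 rather than a flat 4.

import math

top_logprobs = {"4": math.log(0.6), "5": math.log(0.3), "3": math.log(0.1)}

total_score = sum(float(token) * math.exp(lp) for token, lp in top_logprobs.items())
total_probability = sum(math.exp(lp) for lp in top_logprobs.values())
weighted_score = total_score / total_probability  # 4.2 with these toy numbers (0.6*4 + 0.3*5 + 0.1*3)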
    def __init__(self, eval_config: EvalConfig, run_config: RunConfigProperties | None):
        if (
            eval_config.config_type != EvalConfigType.g_eval
            and eval_config.config_type != EvalConfigType.llm_as_judge
        ):
            raise ValueError(
                f"GEval must be initialized with a GEval or LLM as Judge config_type. Got {eval_config.config_type}"
            )

        super().__init__(eval_config, run_config)

        self.geval_task = GEvalTask(eval_config)
    def generate_final_answer_run_description(
        self, eval_input: str, eval_output: str
    ) -> str:
        return f"""The model was given the following input for the task:
<eval_data>
{eval_input}
</eval_data>

The model produced the following output for the task:
<eval_data>
{eval_output}
</eval_data>
"""
    def generate_ref_ans_run_description(
        self, eval_input: str, eval_output: str, reference_answer: str
    ) -> str:
        return f"""The model was given the following input for the task:
<eval_data>
{eval_input}
</eval_data>

The model produced the following output for the task:
<eval_data>
{eval_output}
</eval_data>

This is the reference answer:
<eval_data>
{reference_answer}
</eval_data>
"""
    def generate_full_trace_run_description(
        self,
        eval_input: str,
        available_tools: str | None,
        conversation_history: str,
    ) -> str:
        description = ""
        description += f"""The model was given the following <user_input> for the <task_description>:
<eval_data>
<user_input>{eval_input}</user_input>
</eval_data>
"""
        appropriate_tool_use_guidelines = str(
            self.eval.template_properties.get("appropriate_tool_use_guidelines") or ""
        )
        description += """The model was given the following <appropriate_tool_use_guidelines> guidelines:"""
        description += f"""
<eval_data>
<appropriate_tool_use_guidelines>
{appropriate_tool_use_guidelines}
</appropriate_tool_use_guidelines>
</eval_data>
"""
        inappropriate_tool_use_guidelines = str(
            self.eval.template_properties.get("inappropriate_tool_use_guidelines") or ""
        )
        # Only include if it has content since it is optional
        if inappropriate_tool_use_guidelines:
            description += """The model was given the following <inappropriate_tool_use_guidelines> guidelines:"""
            description += f"""
<eval_data>
<inappropriate_tool_use_guidelines>
{inappropriate_tool_use_guidelines}
</inappropriate_tool_use_guidelines>
</eval_data>
"""

        if available_tools is not None:
            if available_tools != "":
                description += f"""
This is the list of tools available to the model:
<eval_data>
<available_tools>{available_tools}</available_tools>
</eval_data>
"""
            else:
                description += """
There were no tools available to the model.
"""

        description += f"""
This is the full conversation history for the task run:
<eval_data>
<conversation_history>{conversation_history}</conversation_history>
</eval_data>
"""
        return description
    async def run_eval(
        self, task_run: TaskRun, eval_job_item: TaskRun | None = None
    ) -> tuple[EvalScores, Dict[str, str] | None]:
        """
        Run this eval on the given task run.
        """

        model_name, provider = self.model_and_provider()

        # Only fetch logprobs for G-Eval
        # There are at most 5 valid rating tokens per rating type (five_star being the largest), so 10 is more than enough to capture even the very unlikely ones
        top_logprobs = (
            10 if self.eval_config.config_type == EvalConfigType.g_eval else None
        )

        # We don't expose setting this manually in the UI, so pull a recommended mode from ml_model_list
        structured_output_mode = default_structured_output_mode_for_model_provider(
            model_name,
            provider,
            default=StructuredOutputMode.json_schema,
            # G-eval expects JSON, so don't allow function calling modes
            disallowed_modes=[
                StructuredOutputMode.function_calling,
                StructuredOutputMode.function_calling_weak,
            ],
        )

        adapter = adapter_for_task(
            self.geval_task,
            run_config_properties=RunConfigProperties(
                model_name=model_name,
                model_provider_name=provider,
                # We always use Simple COT for G-Eval and LLM as Judge
                prompt_id=PromptGenerators.SIMPLE_CHAIN_OF_THOUGHT,
                structured_output_mode=structured_output_mode,
            ),
            base_adapter_config=AdapterConfig(
                # Don't save this run into the task_runs. It will be saved into an eval_run where it belongs
                allow_saving=False,
                top_logprobs=top_logprobs,
            ),
        )

        if self.eval.evaluation_data_type == EvalDataType.full_trace:
            if task_run.trace is None:
                raise ValueError("Task run trace is required for full trace evaluation")

            available_tools = await EvalUtils.formatted_available_tools_from_task_run(
                task_run
            )
            run_description = self.generate_full_trace_run_description(
                task_run.input,
                available_tools,
                EvalTraceFormatter.trace_to_formatted_conversation_history(
                    task_run.trace
                ),
            )

        elif self.eval.evaluation_data_type == EvalDataType.reference_answer:
            if eval_job_item is None:
                raise ValueError(
                    "Eval job item is required for reference answer evaluation"
                )
            run_description = self.generate_ref_ans_run_description(
                task_run.input, task_run.output.output, eval_job_item.output.output
            )

        else:  # EvalDataType.final_answer
            run_description = self.generate_final_answer_run_description(
                task_run.input, task_run.output.output
            )

        # We don't need the run, but invoke_returning_run_output() runs validations for us over _run()
        _, run_output = await adapter.invoke_returning_run_output(run_description)

        if self.eval_config.config_type == EvalConfigType.llm_as_judge:
            return self.build_llm_as_judge_score(
                run_output
            ), run_output.intermediate_outputs
        else:
            return self.build_g_eval_score(run_output), run_output.intermediate_outputs
Run this eval on the given task run.
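A hedged usage sketch, assuming you already have an EvalConfig (with a parent Eval) and a TaskRun to score; constructing those is out of scope here, and the reference_answer data type would additionally require an eval_job_item:

import asyncio

from kiln_ai.adapters.eval.g_eval import GEval

async def score_task_run(eval_config, task_run):
    evaluator = GEval(eval_config, run_config=None)
    scores, intermediate_outputs = await evaluator.run_eval(task_run)
    return scores  # e.g. {"overall_rating": 4.2}; intermediate_outputs may hold the judge's chain of thought

# asyncio.run(score_task_run(my_eval_config, my_task_run))  # my_* are placeholders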
    def build_llm_as_judge_score(self, run_output: RunOutput) -> EvalScores:
        """
        Build the LLM as Judge score for the given run and run output.
        """
        # Convert the output format we asked for (discrete values) to our float scores
        scores: EvalScores = {}
        if not isinstance(run_output.output, dict):
            raise ValueError("LLM as Judge output must be a dictionary")

        for metric, score in run_output.output.items():
            token_score = self.score_from_token_string(f"{score}")
            if token_score is None:
                raise ValueError(
                    f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema."
                )
            scores[metric] = token_score
        return scores
Build the LLM as Judge score for the given run and run output.
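For a sense of the data shapes involved (toy values): the judge returns the discrete labels requested by the schema, and this method maps them onto floats via the token score map.

judge_output = {"overall_rating": 4, "accuracy": "pass"}
# build_llm_as_judge_score would convert this to:
# {"overall_rating": 4.0, "accuracy": 1.0}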
    def build_g_eval_score(self, run_output: RunOutput) -> EvalScores:
        """
        Build the G-Eval score for the given run and run output.

        We create a weighted average of each rating using the logprobs.

        @misc{liu2023gevalnlgevaluationusing,
            title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
            author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
            year={2023},
            eprint={2303.16634},
            archivePrefix={arXiv},
            primaryClass={cs.CL},
            url={https://arxiv.org/abs/2303.16634},
        }
        """
        # We use structured output
        outputs = run_output.output
        assert isinstance(outputs, dict)

        # Build raw string output from the logprobs, which is easier to work with than Dict for the next bit
        raw_output = self.raw_output_from_logprobs(run_output)

        # find the offset to the start of each metric in the raw output json
        metrics: List[str] = list(outputs.keys())
        metric_offsets = self.metric_offsets(raw_output, metrics)

        final_scores: EvalScores = {}
        for metric in metrics:
            score = self.g_eval_single_metric(
                run_output, metric, metric_offsets, raw_output
            )
            if score is None:
                raise ValueError(
                    f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema."
                )
            final_scores[metric] = score

        return final_scores
Build the G-Eval score for the given run and run output.
We create a weighted average of each rating using the logprobs.
@misc{liu2023gevalnlgevaluationusing,
    title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
    author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
    year={2023},
    eprint={2303.16634},
    archivePrefix={arXiv},
    primaryClass={cs.CL},
    url={https://arxiv.org/abs/2303.16634},
}
    def g_eval_single_metric(
        self,
        run_output: RunOutput,
        metric: str,
        metric_offsets: Dict[str, int],
        raw_output: str,
    ) -> float | None:
        """
        Run the G-Eval for a single metric.

        Scan the logprobs for the metric and return the weighted score of the rating token.
        """

        start_offset, end_offset = self.token_search_range(
            raw_output, metric, metric_offsets
        )

        offset = 0

        if (
            run_output.output_logprobs is None
            or run_output.output_logprobs.content is None
        ):
            raise RuntimeError(
                "No logprobs found for output - can not calculate g-eval"
            )

        # scan the tokens in the range, looking for the rating token
        for _, chat_logprob in enumerate(run_output.output_logprobs.content):
            if offset >= end_offset:
                break
            if offset >= start_offset:
                score = self.rating_token_to_score(chat_logprob)
                if score is not None:
                    return score
            offset += len(chat_logprob.token)

        return None
Run the G-Eval for a single metric.
Scan the logprobs for the metric and return the weighted score of the rating token.
    def raw_output_from_logprobs(self, run_output: RunOutput) -> str:
        """
        Build the raw output string from the logprobs.
        Generate from logprobs so it's guaranteed to match the logprobs offsets
        """
        if (
            run_output.output_logprobs is None
            or run_output.output_logprobs.content is None
        ):
            raise RuntimeError(
                "No logprobs found for output - can not calculate g-eval"
            )

        raw = ""
        for chat_logprob in run_output.output_logprobs.content:
            raw += chat_logprob.token
        return raw
Build the raw output string from the logprobs. Generate from logprobs so it's guaranteed to match the logprobs offsets
    def token_search_range(
        self, raw_output: str, metric: str, metric_offsets: Dict[str, int]
    ) -> Tuple[int, int]:
        """
        Find the start and end offsets of the metric in the raw output.

        Start searching after the end of the target metric json entry ("overall_rating":), and before the start of the next metric ("some_other_score").
        """
        start_offset = metric_offsets[metric] + len(metric)

        # Find the lowest end offset that is greater than the start offset
        end_offset = len(raw_output)
        for v in list(metric_offsets.values()):
            if v < end_offset and v > start_offset:
                end_offset = v

        return start_offset, end_offset
Find the start and end offsets of the metric in the raw output.
Start searching after the end of the target metric json entry ("overall_rating":), and before the start of the next metric ("some_other_score").
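A worked example with a toy raw output, mirroring the docstring above (offsets computed by hand):

raw_output = '{"overall_rating": 4, "some_other_score": 2}'
metric_offsets = {"overall_rating": 1, "some_other_score": 22}

# For "overall_rating": start_offset = 1 + len("overall_rating") = 15, and
# end_offset = 22 (the next metric's offset), so the rating token for
# "overall_rating" is searched for between offsets 15 and 22.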
    def rating_token_to_score(
        self, token_logprob: ChatCompletionTokenLogprob
    ) -> float | None:
        """
        Convert a rating token to a score using weighted average of top logprobs.

        Only includes tokens that have valid scores.

        Some cleanup for upper case, whitespace and quotes. LLMs aren't always consistent.
        """
        primary_token_score = self.score_from_token_string(token_logprob.token)
        # check this is a real rating token, it could just be the ": ", "," or whitespace
        if primary_token_score is None:
            return None

        total_score = 0.0
        total_probability = 0.0
        top_logprobs_contains_primary_token = False

        # Process all valid scoring tokens from alternatives
        for top_logprob in token_logprob.top_logprobs:
            if top_logprob.token == token_logprob.token:
                top_logprobs_contains_primary_token = True
            token_score = self.score_from_token_string(top_logprob.token)
            if token_score is not None:
                # Convert logprob to probability
                probability = math.exp(top_logprob.logprob)
                total_score += token_score * probability
                total_probability += probability

        # Weird OpenAI 4o bug - sometimes the primary token is included in the top logprobs, sometimes not.
        # Add the primary token back in if excluded
        if not top_logprobs_contains_primary_token:
            if token_logprob.logprob == -9999.0:
                # Another "bug" - sometimes the logprob is -9999.0. This seems to happen when the rest of the logprobs are tiny probability.
                total_score += primary_token_score * 1.0
                total_probability += 1.0
            else:
                probability = math.exp(token_logprob.logprob)
                total_score += primary_token_score * probability
                total_probability += probability

        if total_probability <= 0.0:
            raise RuntimeError(
                f"No valid scoring tokens found for {token_logprob.token}. This should never happen as the token has a valid score (so it must be excluded from top logprobs). Please file a bug if you see this."
            )

        # Normalize by total probability of valid tokens (LLM may have wanted to generate other non-rating tokens, these shouldn't lower score of rating tokens)
        weighted_score = total_score / total_probability

        return weighted_score
Convert a rating token to a score using weighted average of top logprobs.
Only includes tokens that have valid scores.
Some cleanup for upper case, whitespace and quotes. LLMs aren't always consistent.
    def score_from_token_string(self, token: str) -> float | None:
        if token in TOKEN_TO_SCORE_MAP:
            return TOKEN_TO_SCORE_MAP[token]

        # handle more token variations like '"1"' and '"pass"' and ' paSS' and 'PASS'
        unquoted_token = token.strip().strip('"').lower()
        if unquoted_token in TOKEN_TO_SCORE_MAP:
            return TOKEN_TO_SCORE_MAP[unquoted_token]

        # handle numeric tokens like "1.0"
        try:
            float_value = float(token)
            if float_value.is_integer():
                str_token = str(int(float_value))
                if str_token in TOKEN_TO_SCORE_MAP:
                    return TOKEN_TO_SCORE_MAP[str_token]
        except ValueError:
            pass

        return None
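Illustrative conversions, assuming `g` is an already-constructed GEval instance (construction omitted here):

assert g.score_from_token_string("4") == 4.0
assert g.score_from_token_string('"PASS"') == 1.0  # quotes, case and whitespace are stripped
assert g.score_from_token_string("3.0") == 3.0     # integer-valued floats map to "3"
assert g.score_from_token_string(", ") is None     # separators are not rating tokens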
    def metric_offsets(self, raw_output: str, metrics: List[str]) -> Dict[str, int]:
        """
        Find the offset to the start of each metric in the raw output json.

        For the example json `{"overall_rating": 1}`, this should return:
        {
            "overall_rating": 1  # the metric name starts 1 character into the json string
        }
        """
        metric_offsets: Dict[str, int] = {}
        for metric in metrics:
            # the quoted metric name is expected in the json, e.g. `"overall_rating"` in `{"overall_rating": 1}`
            metric_name = f'"{metric}"'

            # we expect it exactly once
            count = raw_output.count(metric_name)
            if count != 1:
                raise ValueError(
                    f"Metric {metric} should appear exactly once in the output. Found {count} times"
                )

            offset = raw_output.find(metric_name)
            if offset == -1:
                raise ValueError(f"Metric {metric} not found in raw output")
            metric_offsets[metric] = offset
        return metric_offsets
Find the offset to the start of each metric in the raw output json
For the example json `{"overall_rating": 1}`, this should return {"overall_rating": 1}, since the quoted metric name starts 1 character into the json string.
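A concrete toy example of the offsets this returns (equivalent to str.find on the quoted metric names):

raw_output = '{"overall_rating": 4, "tone": 5}'
assert raw_output.find('"overall_rating"') == 1
assert raw_output.find('"tone"') == 22
# metric_offsets(raw_output, ["overall_rating", "tone"]) -> {"overall_rating": 1, "tone": 22}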