kiln_ai.adapters.eval.g_eval
import math
from typing import Dict, List, Tuple

from litellm.types.utils import ChatCompletionTokenLogprob

from kiln_ai.adapters.adapter_registry import adapter_for_task
from kiln_ai.adapters.eval.base_eval import BaseEval
from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig, RunOutput
from kiln_ai.adapters.prompt_builders import PromptGenerators
from kiln_ai.datamodel import Project, Task, TaskRun
from kiln_ai.datamodel.eval import EvalConfig, EvalConfigType, EvalScores
from kiln_ai.datamodel.task import RunConfig

# all the tokens we score for, and their float scores.
TOKEN_TO_SCORE_MAP: Dict[str, float] = {
    "1": 1.0,
    "2": 2.0,
    "3": 3.0,
    "4": 4.0,
    "5": 5.0,
    "pass": 1.0,
    "fail": 0.0,
    "critical": -1.0,
}


class GEvalTask(Task, parent_of={}):
    """
    Kiln task for executing a G-Eval. Can be run on any Kiln adapter which supports logprobs.

    Note G-Eval implements both G-Eval and LLM as Judge as they are very similar.
    """

    def __init__(self, eval_config: EvalConfig):
        tmp_project = Project(name="GEval")

        # Build a simple LLM as Judge system instruction
        system_instruction = "Your job is to evaluate a model's performance on a task. Blocks will be marked with <eval_data> tags.\n"
        # Optionally add a short task description
        task_description = eval_config.properties.get("task_description", None)
        if task_description:
            system_instruction += f"\nThe task the model was given is as follows:\n<eval_data>\n{task_description}\n</eval_data>\n"

        # Build the COT eval instructions
        cot_instructions = "First, think step by step about the model's performance following these evaluation steps:\n\n"
        steps = eval_config.properties.get("eval_steps", None)
        if not steps or not isinstance(steps, list):
            raise ValueError("eval_steps must be a list")
        for i, step in enumerate(steps):
            cot_instructions += f"{i + 1}) {step}\n"

        eval = eval_config.parent_eval()
        if not eval:
            raise ValueError("Eval config must have a parent eval")

        # Build the output schema from the eval's target output scores.
        # We restrict the LLM's output scoring schema to discrete scores (pass/fail/critical/1-5) - allow_float_scores=False
        # However, the final scores from the evaluator can be a float (see later logprob calculation, which requires discrete token outputs)
        output_schema = BaseEval.build_score_schema(eval, allow_float_scores=False)

        super().__init__(
            name="GEval Task",
            parent=tmp_project,
            instruction=system_instruction,
            thinking_instruction=cot_instructions,
            output_json_schema=output_schema,
        )


class GEval(BaseEval):
    """
    An evaluator which implements G-Eval and LLM as Judge.

    G-Eval is a method of evaluating the quality of a model's output. It is a weighted average of the scores of the tokens in the output. The weights are the probabilities of the tokens in the output, derived from logprobs. https://arxiv.org/abs/2303.16634

    LLM as Judge is a method of evaluating the quality of a model's output. It simply asks the LLM to score, and uses the returned output (no logprobs needed). Also called direct evaluation.

    @misc{liu2023gevalnlgevaluationusing,
        title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
        author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
        year={2023},
        eprint={2303.16634},
        archivePrefix={arXiv},
        primaryClass={cs.CL},
        url={https://arxiv.org/abs/2303.16634},
    }
    """

    def __init__(self, eval_config: EvalConfig, run_config: RunConfig | None):
        if (
            eval_config.config_type != EvalConfigType.g_eval
            and eval_config.config_type != EvalConfigType.llm_as_judge
        ):
            raise ValueError(
                f"GEval must be initialized with a GEval or LLM as Judge config_type. Got {eval_config.config_type}"
            )

        super().__init__(eval_config, run_config)

        self.geval_task = GEvalTask(eval_config)

    async def run_eval(
        self, task_run: TaskRun
    ) -> tuple[EvalScores, Dict[str, str] | None]:
        """
        Run this eval on the given task run.
        """

        model_name, provider = self.model_and_provider()

        # Only fetch logprobs for G-Eval
        # There are at most 5 valid rating tokens per rating type (five_star being largest), so 10 is more than enough to include even the very unlikely options
        top_logprobs = (
            10 if self.eval_config.config_type == EvalConfigType.g_eval else None
        )

        adapter = adapter_for_task(
            self.geval_task,
            model_name,
            provider,
            # We always use Simple COT for G-Eval and LLM as Judge
            prompt_id=PromptGenerators.SIMPLE_CHAIN_OF_THOUGHT,
            base_adapter_config=AdapterConfig(
                # Don't save this run into the task_runs. It will be saved into an eval_run where it belongs
                allow_saving=False,
                top_logprobs=top_logprobs,
            ),
        )

        input = f"""The model was given the following input for the task:
<eval_data>
{task_run.input}
</eval_data>

The model produced the following output for the task:
<eval_data>
{task_run.output}
</eval_data>
"""

        # We don't need the run, but invoke_returning_run_output() runs validations for us over _run()
        _, run_output = await adapter.invoke_returning_run_output(input)

        if self.eval_config.config_type == EvalConfigType.llm_as_judge:
            return self.build_llm_as_judge_score(
                run_output
            ), run_output.intermediate_outputs
        else:
            return self.build_g_eval_score(run_output), run_output.intermediate_outputs

    def build_llm_as_judge_score(self, run_output: RunOutput) -> EvalScores:
        """
        Build the LLM as Judge score for the given run and run output.
        """
        # Convert the output format we asked for (discrete values) to our float scores
        scores: EvalScores = {}
        if not isinstance(run_output.output, dict):
            raise ValueError("LLM as Judge output must be a dictionary")

        for metric, score in run_output.output.items():
            token_score = self.score_from_token_string(f"{score}")
            if token_score is None:
                raise ValueError(
                    f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema."
                )
            scores[metric] = token_score
        return scores

    def build_g_eval_score(self, run_output: RunOutput) -> EvalScores:
        """
        Build the G-Eval score for the given run and run output.

        We create a weighted average of each rating using the logprobs.

        @misc{liu2023gevalnlgevaluationusing,
            title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
            author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
            year={2023},
            eprint={2303.16634},
            archivePrefix={arXiv},
            primaryClass={cs.CL},
            url={https://arxiv.org/abs/2303.16634},
        }
        """
        # We use structured output
        outputs = run_output.output
        assert isinstance(outputs, dict)

        # Build raw string output from the logprobs, which is easier to work with than Dict for the next bit
        raw_output = self.raw_output_from_logprobs(run_output)

        # find the offset of the start of each metric in the raw output json
        metrics: List[str] = list(outputs.keys())
        metric_offsets = self.metric_offsets(raw_output, metrics)

        final_scores: EvalScores = {}
        for metric in metrics:
            score = self.g_eval_single_metric(
                run_output, metric, metric_offsets, raw_output
            )
            if score is None:
                raise ValueError(
                    f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema."
                )
            final_scores[metric] = score

        return final_scores

    def g_eval_single_metric(
        self,
        run_output: RunOutput,
        metric: str,
        metric_offsets: Dict[str, int],
        raw_output: str,
    ) -> float | None:
        """
        Run the G-Eval for a single metric.

        Scan the logprobs for the metric and return the weighted score of the rating token.
        """

        start_offset, end_offset = self.token_search_range(
            raw_output, metric, metric_offsets
        )

        offset = 0

        if (
            run_output.output_logprobs is None
            or run_output.output_logprobs.content is None
        ):
            raise RuntimeError(
                "No logprobs found for output - can not calculate g-eval"
            )

        # scan the tokens in the range, looking for the rating token
        for _, chat_logprob in enumerate(run_output.output_logprobs.content):
            if offset >= end_offset:
                break
            if offset >= start_offset:
                score = self.rating_token_to_score(chat_logprob)
                if score is not None:
                    return score
            offset += len(chat_logprob.token)

        return None

    def raw_output_from_logprobs(self, run_output: RunOutput) -> str:
        """
        Build the raw output string from the logprobs. Generate from logprobs so it's guaranteed to match the logprobs offsets
        """
        if (
            run_output.output_logprobs is None
            or run_output.output_logprobs.content is None
        ):
            raise RuntimeError(
                "No logprobs found for output - can not calculate g-eval"
            )

        raw = ""
        for chat_logprob in run_output.output_logprobs.content:
            raw += chat_logprob.token
        return raw

    def token_search_range(
        self, raw_output: str, metric: str, metric_offsets: Dict[str, int]
    ) -> Tuple[int, int]:
        """
        Find the start and end offsets of the metric in the raw output.

        Start searching after the end of the target metric json entry ("overall_rating":), and before the start of the next metric ("some_other_score").
        """
        start_offset = metric_offsets[metric] + len(metric)

        # Find the lowest end offset that is greater than the start offset
        end_offset = len(raw_output)
        for v in list(metric_offsets.values()):
            if v < end_offset and v > start_offset:
                end_offset = v

        return start_offset, end_offset

    def rating_token_to_score(
        self, token_logprob: ChatCompletionTokenLogprob
    ) -> float | None:
        """
        Convert a rating token to a score using weighted average of top logprobs.

        Only includes tokens that have valid scores.

        Some cleanup for upper case, whitespace and quotes. LLMs aren't always consistent.
        """
        primary_token_score = self.score_from_token_string(token_logprob.token)
        # check this is a real rating token, it could just be the ": ", "," or whitespace
        if not primary_token_score:
            return None

        total_score = 0.0
        total_probability = 0.0
        top_logprobs_contains_primary_token = False

        # Process all valid scoring tokens from alternatives
        for top_logprob in token_logprob.top_logprobs:
            if top_logprob.token == token_logprob.token:
                top_logprobs_contains_primary_token = True
            token_score = self.score_from_token_string(top_logprob.token)
            if token_score is not None:
                # Convert logprob to probability
                probability = math.exp(top_logprob.logprob)
                total_score += token_score * probability
                total_probability += probability

        # Weird OpenAI 4o bug - sometimes the primary token is included in the top logprobs, sometimes not.
        # Add the primary token back in if excluded
        if not top_logprobs_contains_primary_token:
            if token_logprob.logprob == -9999.0:
                # Another "bug" - sometimes the logprob is -9999.0. This seems to happen when the rest of the logprobs are tiny probability.
                total_score += primary_token_score * 1.0
                total_probability += 1.0
            else:
                probability = math.exp(token_logprob.logprob)
                total_score += primary_token_score * probability
                total_probability += probability

        if total_probability <= 0.0:
            raise RuntimeError(
                f"No valid scoring tokens found for {token_logprob.token}. This should never happen as the token has a valid score (so it must be excluded from top logprobs). Please file a bug if you see this."
            )

        # Normalize by total probability of valid tokens (LLM may have wanted to generate other non-rating tokens, these shouldn't lower score of rating tokens)
        weighted_score = total_score / total_probability

        return weighted_score

    def score_from_token_string(self, token: str) -> float | None:
        if token in TOKEN_TO_SCORE_MAP:
            return TOKEN_TO_SCORE_MAP[token]

        # handle more token variations like '"1"' and '"pass"' and ' paSS' and 'PASS'
        unquoted_token = token.strip().strip('"').lower()
        if unquoted_token in TOKEN_TO_SCORE_MAP:
            return TOKEN_TO_SCORE_MAP[unquoted_token]

        # handle numeric tokens like "1.0"
        try:
            float_value = float(token)
            if float_value.is_integer():
                str_token = str(int(float_value))
                if str_token in TOKEN_TO_SCORE_MAP:
                    return TOKEN_TO_SCORE_MAP[str_token]
        except ValueError:
            pass

        return None

    def metric_offsets(self, raw_output: str, metrics: List[str]) -> Dict[str, int]:
        """
        Find the offset to the start of each metric in the raw output json

        For the example json: `{"overall_rating": 1}`

        should return:
        {
            "overall_rating": 1 # it's 1 character into the json string
        }
        """
        metric_offsets: Dict[str, int] = {}
        for metric in metrics:
            # the quoted metric name is expected in the json: `{"overall_rating": 1}`
            metric_name = f'"{metric}"'

            # we expect it exactly once
            count = raw_output.count(metric_name)
            if count != 1:
                raise ValueError(
                    f"Metric {metric} should appear exactly once in the output. Found {count} times"
                )

            offset = raw_output.find(metric_name)
            if offset == -1:
                raise ValueError(f"Metric {metric} not found in raw output")
            metric_offsets[metric] = offset
        return metric_offsets
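A minimal usage sketch, assuming you already have a Kiln EvalConfig (with config_type g_eval or llm_as_judge) and a TaskRun to score; how those are loaded or created is outside this module, so the objects below are placeholders:

import asyncio

from kiln_ai.adapters.eval.g_eval import GEval
from kiln_ai.datamodel import TaskRun
from kiln_ai.datamodel.eval import EvalConfig


async def score_run(eval_config: EvalConfig, task_run: TaskRun) -> None:
    # run_config can also be a RunConfig if your BaseEval usage requires one
    evaluator = GEval(eval_config, run_config=None)
    scores, intermediate_outputs = await evaluator.run_eval(task_run)
    # scores is an EvalScores dict, e.g. {"overall_rating": 4.25} (values illustrative)
    print(scores, intermediate_outputs)


# asyncio.run(score_run(my_eval_config, my_task_run))  # hypothetical objects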
class GEvalTask(Task, parent_of={}):
Kiln task for executing a G-Eval. Can be run on any Kiln adapter which supports logprobs.
Note G-Eval implements both G-Eval and LLM as Judge as they are very similar.
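As an illustration of what the constructor builds, here is a standalone sketch (with hypothetical task_description and eval_steps values) of the instruction strings it assembles from eval_config.properties:

# Hypothetical eval config properties
task_description = "Summarize a news article in one paragraph."
eval_steps = [
    "Check the output follows the requested format.",
    "Check the output is factually consistent with the input.",
]

system_instruction = "Your job is to evaluate a model's performance on a task. Blocks will be marked with <eval_data> tags.\n"
if task_description:
    system_instruction += f"\nThe task the model was given is as follows:\n<eval_data>\n{task_description}\n</eval_data>\n"

cot_instructions = "First, think step by step about the model's performance following these evaluation steps:\n\n"
for i, step in enumerate(eval_steps):
    cot_instructions += f"{i + 1}) {step}\n"

print(system_instruction)
print(cot_instructions)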
GEvalTask(eval_config: EvalConfig)

Builds the judge task from the eval config: a system instruction (optionally including a task description), numbered chain-of-thought instructions from the eval_steps property, and a discrete-score output schema built with BaseEval.build_score_schema.
class GEval(BaseEval):
An evaluator which implements G-Eval and LLM as Judge.
G-Eval is a method of evaluating the quality of a model's output. For each metric, the score is an average of the candidate rating tokens' scores, weighted by those tokens' probabilities (derived from logprobs). https://arxiv.org/abs/2303.16634
LLM as Judge is a method of evaluating the quality of a model's output. It simply asks the LLM to score, and uses the returned output (no logprobs needed). Also called direct evaluation.
@misc{liu2023gevalnlgevaluationusing,
    title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
    author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
    year={2023},
    eprint={2303.16634},
    archivePrefix={arXiv},
    primaryClass={cs.CL},
    url={https://arxiv.org/abs/2303.16634},
}
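For reference, the core G-Eval idea in one formula: each metric's score is sum(p_i * s_i) / sum(p_i) over the candidate rating tokens, where s_i comes from TOKEN_TO_SCORE_MAP and p_i is the token's probability. A minimal sketch with made-up probabilities:

# Probability-weighted average over candidate rating tokens (made-up numbers).
candidates = {"4": 0.6, "5": 0.3, "3": 0.1}  # token -> probability
token_scores = {"3": 3.0, "4": 4.0, "5": 5.0}

weighted = sum(p * token_scores[t] for t, p in candidates.items())
total_p = sum(candidates.values())
print(weighted / total_p)  # ~4.2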
GEval(eval_config: EvalConfig, run_config: RunConfig | None)

Raises ValueError if eval_config.config_type is not g_eval or llm_as_judge.
async GEval.run_eval(task_run: TaskRun) -> tuple[EvalScores, Dict[str, str] | None]
Run this eval on the given task run.
GEval.build_llm_as_judge_score(run_output: RunOutput) -> EvalScores
Build the LLM as Judge score for the given run and run output.
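For example (hypothetical output values), the discrete ratings in the judge's structured output map straight onto float scores via TOKEN_TO_SCORE_MAP; the real method goes through score_from_token_string and raises if a value does not map:

from kiln_ai.adapters.eval.g_eval import TOKEN_TO_SCORE_MAP

judge_output = {"overall_rating": "4", "formatting": "pass"}  # hypothetical structured output
scores = {
    metric: TOKEN_TO_SCORE_MAP[str(value).strip().strip('"').lower()]
    for metric, value in judge_output.items()
}
print(scores)  # {'overall_rating': 4.0, 'formatting': 1.0}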
GEval.build_g_eval_score(run_output: RunOutput) -> EvalScores
Build the G-Eval score for the given run and run output.
We create a weighted average of each rating using the logprobs.
@misc{liu2023gevalnlgevaluationusing,
    title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
    author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
    year={2023},
    eprint={2303.16634},
    archivePrefix={arXiv},
    primaryClass={cs.CL},
    url={https://arxiv.org/abs/2303.16634},
}
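To make the flow concrete, a hypothetical sketch of the intermediate shapes (all values made up):

# Hypothetical data shapes in the G-Eval scoring pipeline.
structured_output = {"accuracy": 4, "overall_rating": 5}    # parsed judge output
raw_output = '{"accuracy": 4, "overall_rating": 5}'         # rebuilt from logprob tokens
metric_offsets = {"accuracy": 1, "overall_rating": 16}      # offsets of the quoted metric names
final_scores = {"accuracy": 3.8, "overall_rating": 4.7}     # logprob-weighted, may fall between ratings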
GEval.g_eval_single_metric(run_output: RunOutput, metric: str, metric_offsets: Dict[str, int], raw_output: str) -> float | None
Run the G-Eval for a single metric.
Scan the logprobs for the metric and return the weighted score of the rating token.
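A standalone sketch of the scan (hypothetical tokens and offsets, reusing the example raw output above): walk the logprob tokens in order, track the character offset, and convert the first rating token inside [start_offset, end_offset):

from kiln_ai.adapters.eval.g_eval import TOKEN_TO_SCORE_MAP

tokens = ['{"', 'accuracy', '":', ' ', '4', ',', ' "', 'overall', '_rating', '":', ' ', '5', '}']
start_offset, end_offset = 9, 16  # search range for "accuracy" (see token_search_range below)

offset = 0
found = None
for token in tokens:
    if offset >= end_offset:
        break
    if offset >= start_offset and token.strip().strip('"').lower() in TOKEN_TO_SCORE_MAP:
        found = token  # the real method returns rating_token_to_score(...) here
        break
    offset += len(token)
print(found)  # '4'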
GEval.raw_output_from_logprobs(run_output: RunOutput) -> str
Build the raw output string from the logprobs. Generating it from the logprobs guarantees it matches the logprob token offsets.
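A tiny illustration (hypothetical tokens): because the raw string is just the tokens concatenated, any character offset computed on it can be walked token by token:

tokens = ['{"', 'overall', '_rating', '":', ' ', '4', '}']
raw_output = "".join(tokens)
print(raw_output)  # {"overall_rating": 4}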
GEval.token_search_range(raw_output: str, metric: str, metric_offsets: Dict[str, int]) -> Tuple[int, int]
Find the start and end offsets of the metric in the raw output.
Start searching after the end of the target metric json entry ("overall_rating":), and before the start of the next metric ("some_other_score").
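A worked example (hypothetical raw output and offsets) of the ranges this produces:

raw_output = '{"accuracy": 4, "overall_rating": 5}'
metric_offsets = {"accuracy": 1, "overall_rating": 16}

# "accuracy": start = 1 + len("accuracy") = 9, end = 16 (offset of the next metric)
# "overall_rating": start = 16 + len("overall_rating") = 30, end = len(raw_output) = 36
for metric, offset in metric_offsets.items():
    start = offset + len(metric)
    end = min([v for v in metric_offsets.values() if v > start] + [len(raw_output)])
    print(metric, start, end)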
GEval.rating_token_to_score(token_logprob: ChatCompletionTokenLogprob) -> float | None
Convert a rating token to a score using a weighted average of the top logprobs.
Only includes tokens that have valid scores.
Does some cleanup for upper case, whitespace and quotes, since LLMs aren't always consistent.
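A worked example with made-up logprobs, mirroring the calculation above (not calling the method itself, which needs a GEval instance):

import math

top_logprobs = [
    ("4", -0.36),   # ~0.70 probability
    ("5", -1.61),   # ~0.20 probability
    (" ", -2.30),   # ~0.10 probability, not a rating token -> ignored
]
token_scores = {"4": 4.0, "5": 5.0}

total_score = 0.0
total_probability = 0.0
for token, logprob in top_logprobs:
    score = token_scores.get(token.strip().strip('"').lower())
    if score is not None:
        p = math.exp(logprob)  # convert logprob -> probability
        total_score += score * p
        total_probability += p

# Normalizing by the probability mass of valid rating tokens keeps non-rating
# alternatives (like whitespace) from dragging the score down.
print(total_score / total_probability)  # ~4.22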
GEval.score_from_token_string(token: str) -> float | None

Map a rating token string to its float score using TOKEN_TO_SCORE_MAP, with cleanup for quotes, whitespace, case, and integer-valued numeric strings like "1.0". Returns None if the token is not a rating token.
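A few quick lookups showing the normalization behavior (this mirrors the method's first two branches; the numeric branch additionally maps strings like "1.0"):

from kiln_ai.adapters.eval.g_eval import TOKEN_TO_SCORE_MAP

# Canonical tokens hit the map directly...
print(TOKEN_TO_SCORE_MAP["4"], TOKEN_TO_SCORE_MAP["pass"])  # 4.0 1.0
# ...and the cleanup lets messier judge tokens resolve to the same entries:
for raw_token in ['"PASS"', " Fail ", '"critical"']:
    cleaned = raw_token.strip().strip('"').lower()
    print(repr(raw_token), "->", TOKEN_TO_SCORE_MAP.get(cleaned))  # 1.0, 0.0, -1.0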
GEval.metric_offsets(raw_output: str, metrics: List[str]) -> Dict[str, int]
Find the offset to the start of each metric in the raw output JSON. Each quoted metric name must appear exactly once.

For the example JSON `{"overall_rating": 1}`, it should return:

{
    "overall_rating": 1  # the quoted metric name starts 1 character into the JSON string
}
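A quick check of the offsets used in the earlier examples (hypothetical JSON):

raw_output = '{"accuracy": 4, "overall_rating": 5}'
for metric in ["accuracy", "overall_rating"]:
    print(metric, raw_output.find(f'"{metric}"'))  # accuracy 1, overall_rating 16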