kiln_ai.adapters.eval.g_eval
```python
import math
from typing import Dict, List, Tuple

from litellm.types.utils import ChatCompletionTokenLogprob

from kiln_ai.adapters.adapter_registry import adapter_for_task
from kiln_ai.adapters.eval.base_eval import BaseEval
from kiln_ai.adapters.ml_model_list import (
    default_structured_output_mode_for_model_provider,
)
from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig, RunOutput
from kiln_ai.adapters.prompt_builders import PromptGenerators
from kiln_ai.datamodel import Project, Task, TaskRun
from kiln_ai.datamodel.eval import EvalConfig, EvalConfigType, EvalScores
from kiln_ai.datamodel.task import RunConfigProperties, StructuredOutputMode

# all the tokens we score for, and their float scores.
TOKEN_TO_SCORE_MAP: Dict[str, float] = {
    "1": 1.0,
    "2": 2.0,
    "3": 3.0,
    "4": 4.0,
    "5": 5.0,
    "pass": 1.0,
    "fail": 0.0,
    "critical": -1.0,
}
```
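As a quick illustration of how these discrete rating tokens become numeric scores, here is a minimal sketch that only reads the map above (the evaluator below additionally normalizes case, quotes and whitespace):

```python
from kiln_ai.adapters.eval.g_eval import TOKEN_TO_SCORE_MAP

# Discrete judge outputs map directly to floats; anything outside the map
# is treated as "not a rating" by the scoring code further down.
print(TOKEN_TO_SCORE_MAP["pass"])      # 1.0
print(TOKEN_TO_SCORE_MAP["critical"])  # -1.0
print(TOKEN_TO_SCORE_MAP["4"])         # 4.0
```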
```python
class GEvalTask(Task, parent_of={}):
```
Kiln task for executing a G-Eval. Can be run on any Kiln adapter which supports logprobs.
Note: this task implements both G-Eval and LLM as Judge, as the two are very similar.
```python
    def __init__(self, eval_config: EvalConfig):
        tmp_project = Project(name="GEval")

        # Build a simple LLM as Judge system instruction
        system_instruction = "Your job is to evaluate a model's performance on a task. Blocks will be marked with <eval_data> tags.\n"
        # Optionally add a short task description
        task_description = eval_config.properties.get("task_description", None)
        if task_description:
            system_instruction += f"\nThe task the model was given is as follows:\n<eval_data>\n{task_description}\n</eval_data>\n"

        # Build the COT eval instructions
        cot_instructions = "First, think step by step about the model's performance following these evaluation steps:\n\n"
        steps = eval_config.properties.get("eval_steps", [])
        if not isinstance(steps, list):
            raise ValueError("eval_steps must be a list.")
        for i, step in enumerate(steps):
            cot_instructions += f"{i + 1}) {step}\n"

        eval = eval_config.parent_eval()
        if not eval:
            raise ValueError("Eval config must have a parent eval")

        # Build the output schema from the eval's target output scores.
        # We restrict the LLM's output scoring schema to discrete scores (pass/fail/critical/1-5) - allow_float_scores=False.
        # However, the final scores from the evaluator can be a float (see the later logprob calculation, which requires discrete token outputs).
        output_schema = BaseEval.build_score_schema(eval, allow_float_scores=False)

        super().__init__(
            name="GEval Task",
            parent=tmp_project,
            instruction=system_instruction,
            thinking_instruction=cot_instructions,
            output_json_schema=output_schema,
        )
```
```python
class GEval(BaseEval):
```
An evaluator which implements G-Eval and LLM as Judge.

G-Eval is a method of evaluating the quality of a model's output. The final score for each metric is a weighted average of the candidate rating tokens, weighted by their probabilities (derived from the log probabilities the model returns). https://arxiv.org/abs/2303.16634

LLM as Judge is a method of evaluating the quality of a model's output. It simply asks the LLM for a score and uses the returned output directly (no logprobs needed). Also called direct evaluation.
@misc{liu2023gevalnlgevaluationusing,
    title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
    author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
    year={2023},
    eprint={2303.16634},
    archivePrefix={arXiv},
    primaryClass={cs.CL},
    url={https://arxiv.org/abs/2303.16634},
}
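To make the difference between the two methods concrete, here is a small self-contained sketch with made-up numbers (not the library API): LLM as Judge takes the emitted rating at face value, while G-Eval computes an expected rating from the probabilities of the candidate rating tokens.

```python
import math

emitted = "4"                                           # the rating token the judge printed
candidate_logprobs = {"3": -2.0, "4": -0.3, "5": -1.6}  # hypothetical top logprobs

# LLM as Judge: use the returned output directly
llm_as_judge_score = float(emitted)  # 4.0

# G-Eval: weighted average of the candidate ratings, weighted by token probability
probs = {tok: math.exp(lp) for tok, lp in candidate_logprobs.items()}
g_eval_score = sum(float(tok) * p for tok, p in probs.items()) / sum(probs.values())

print(llm_as_judge_score, round(g_eval_score, 3))  # 4.0 4.062
```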
```python
    def __init__(self, eval_config: EvalConfig, run_config: RunConfigProperties | None):
        if (
            eval_config.config_type != EvalConfigType.g_eval
            and eval_config.config_type != EvalConfigType.llm_as_judge
        ):
            raise ValueError(
                f"GEval must be initialized with a GEval or LLM as Judge config_type. Got {eval_config.config_type}"
            )

        super().__init__(eval_config, run_config)

        self.geval_task = GEvalTask(eval_config)
```
```python
    def generate_run_description(self, eval_input: str, eval_output: str) -> str:
        return f"""The model was given the following input for the task:
<eval_data>
{eval_input}
</eval_data>

The model produced the following output for the task:
<eval_data>
{eval_output}
</eval_data>
"""
```
```python
    async def run_eval(
        self, task_run: TaskRun
    ) -> tuple[EvalScores, Dict[str, str] | None]:
        """
        Run this eval on the given task run.
        """

        model_name, provider = self.model_and_provider()

        # Only fetch logprobs for G-Eval.
        # There are at most 5 valid rating tokens per rating type (five_star being the largest),
        # so 10 is more than enough to capture even the very unlikely ones.
        top_logprobs = (
            10 if self.eval_config.config_type == EvalConfigType.g_eval else None
        )

        # We don't expose setting this manually in the UI, so pull a recommended mode from ml_model_list
        structured_output_mode = default_structured_output_mode_for_model_provider(
            model_name,
            provider,
            default=StructuredOutputMode.json_schema,
            # G-Eval expects JSON, so don't allow function calling modes
            disallowed_modes=[
                StructuredOutputMode.function_calling,
                StructuredOutputMode.function_calling_weak,
            ],
        )

        adapter = adapter_for_task(
            self.geval_task,
            run_config_properties=RunConfigProperties(
                model_name=model_name,
                model_provider_name=provider,
                # We always use Simple COT for G-Eval and LLM as Judge
                prompt_id=PromptGenerators.SIMPLE_CHAIN_OF_THOUGHT,
                structured_output_mode=structured_output_mode,
            ),
            base_adapter_config=AdapterConfig(
                # Don't save this run into the task_runs. It will be saved into an eval_run where it belongs.
                allow_saving=False,
                top_logprobs=top_logprobs,
            ),
        )

        run_description = self.generate_run_description(
            task_run.input, task_run.output.output
        )

        # We don't need the run, but invoke_returning_run_output() runs validations for us over _run()
        _, run_output = await adapter.invoke_returning_run_output(run_description)

        if self.eval_config.config_type == EvalConfigType.llm_as_judge:
            return self.build_llm_as_judge_score(
                run_output
            ), run_output.intermediate_outputs
        else:
            return self.build_g_eval_score(run_output), run_output.intermediate_outputs
```
Run this eval on the given task run.
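A hedged usage sketch of run_eval, assuming you already have a saved EvalConfig (with a g_eval or llm_as_judge config type), a RunConfigProperties, and a TaskRun to score; none of those objects are constructed here, and the function name is illustrative.

```python
from kiln_ai.adapters.eval.g_eval import GEval
from kiln_ai.datamodel import TaskRun
from kiln_ai.datamodel.eval import EvalConfig, EvalScores
from kiln_ai.datamodel.task import RunConfigProperties


async def score_one_run(
    eval_config: EvalConfig,
    run_config: RunConfigProperties,
    task_run: TaskRun,
) -> EvalScores:
    evaluator = GEval(eval_config, run_config)
    scores, intermediate_outputs = await evaluator.run_eval(task_run)
    # scores maps each metric name to a float, e.g. {"overall_rating": 4.2}
    return scores
```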
```python
    def build_llm_as_judge_score(self, run_output: RunOutput) -> EvalScores:
        """
        Build the LLM as Judge score for the given run and run output.
        """
        # Convert the output format we asked for (discrete values) to our float scores
        scores: EvalScores = {}
        if not isinstance(run_output.output, dict):
            raise ValueError("LLM as Judge output must be a dictionary")

        for metric, score in run_output.output.items():
            token_score = self.score_from_token_string(f"{score}")
            if token_score is None:
                raise ValueError(
                    f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema."
                )
            scores[metric] = token_score
        return scores
```
Build the LLM as Judge score for the given run and run output.
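A standalone sketch of that conversion using the module's token map; the metric names and values are made up for illustration, and the real method also handles quoted or float-formatted values via score_from_token_string.

```python
from kiln_ai.adapters.eval.g_eval import TOKEN_TO_SCORE_MAP

judge_output = {"overall_rating": "4", "meets_requirements": "pass"}
scores = {metric: TOKEN_TO_SCORE_MAP[str(value)] for metric, value in judge_output.items()}
print(scores)  # {'overall_rating': 4.0, 'meets_requirements': 1.0}
```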
```python
    def build_g_eval_score(self, run_output: RunOutput) -> EvalScores:
        """
        Build the G-Eval score for the given run and run output.

        We create a weighted average of each rating using the logprobs.

        @misc{liu2023gevalnlgevaluationusing,
            title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
            author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
            year={2023},
            eprint={2303.16634},
            archivePrefix={arXiv},
            primaryClass={cs.CL},
            url={https://arxiv.org/abs/2303.16634},
        }
        """
        # We use structured output
        outputs = run_output.output
        assert isinstance(outputs, dict)

        # Build a raw string output from the logprobs, which is easier to work with than a Dict for the next bit
        raw_output = self.raw_output_from_logprobs(run_output)

        # Find the offset of the start of each metric in the raw output JSON
        metrics: List[str] = list(outputs.keys())
        metric_offsets = self.metric_offsets(raw_output, metrics)

        final_scores: EvalScores = {}
        for metric in metrics:
            score = self.g_eval_single_metric(
                run_output, metric, metric_offsets, raw_output
            )
            if score is None:
                raise ValueError(
                    f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema."
                )
            final_scores[metric] = score

        return final_scores
```
Build the G-Eval score for the given run and run output.
We create a weighted average of each rating using the logprobs.
@misc{liu2023gevalnlgevaluationusing,
    title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
    author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
    year={2023},
    eprint={2303.16634},
    archivePrefix={arXiv},
    primaryClass={cs.CL},
    url={https://arxiv.org/abs/2303.16634},
}
```python
    def g_eval_single_metric(
        self,
        run_output: RunOutput,
        metric: str,
        metric_offsets: Dict[str, int],
        raw_output: str,
    ) -> float | None:
        """
        Run the G-Eval for a single metric.

        Scan the logprobs for the metric and return the weighted score of the rating token.
        """

        start_offset, end_offset = self.token_search_range(
            raw_output, metric, metric_offsets
        )

        offset = 0

        if (
            run_output.output_logprobs is None
            or run_output.output_logprobs.content is None
        ):
            raise RuntimeError(
                "No logprobs found for output - can not calculate g-eval"
            )

        # Scan the tokens in the range, looking for the rating token
        for chat_logprob in run_output.output_logprobs.content:
            if offset >= end_offset:
                break
            if offset >= start_offset:
                score = self.rating_token_to_score(chat_logprob)
                if score is not None:
                    return score
            offset += len(chat_logprob.token)

        return None
```
Run the G-Eval for a single metric.
Scan the logprobs for the metric and return the weighted score of the rating token.
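A standalone sketch of the scan with an illustrative token list (plain strings rather than the real ChatCompletionTokenLogprob objects): walk the tokens while tracking the character offset, and take the first token inside the search window that looks like a rating.

```python
tokens = ['{"', 'overall', '_rating', '":', ' ', '4', ',', ' "', 'clarity', '": ', '5', '}']
start, end = 15, 22  # search window for "overall_rating" (see token_search_range below)

offset = 0
for token in tokens:
    if offset >= end:
        break
    if offset >= start and token.strip().strip('"') in {"1", "2", "3", "4", "5"}:
        print("rating token:", token)  # rating token: 4
        break
    offset += len(token)
```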
```python
    def raw_output_from_logprobs(self, run_output: RunOutput) -> str:
        """
        Build the raw output string from the logprobs.

        Generating it from the logprobs guarantees it matches the logprob offsets.
        """
        if (
            run_output.output_logprobs is None
            or run_output.output_logprobs.content is None
        ):
            raise RuntimeError(
                "No logprobs found for output - can not calculate g-eval"
            )

        return "".join(
            chat_logprob.token for chat_logprob in run_output.output_logprobs.content
        )
```
Build the raw output string from the logprobs. Generating it from the logprobs guarantees it matches the logprob offsets.
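Equivalently, as a tiny standalone sketch (made-up tokens): the raw text is just the logprob tokens joined in order, so character offsets computed over it line up with the token boundaries.

```python
tokens = ['{"', 'overall_rating', '":', ' ', '4', '}']
raw_output = "".join(tokens)
print(raw_output)  # {"overall_rating": 4}
```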
```python
    def token_search_range(
        self, raw_output: str, metric: str, metric_offsets: Dict[str, int]
    ) -> Tuple[int, int]:
        """
        Find the start and end offsets of the metric in the raw output.

        Start searching after the end of the target metric's JSON key ("overall_rating":),
        and stop before the start of the next metric ("some_other_score").
        """
        start_offset = metric_offsets[metric] + len(metric)

        # Find the lowest end offset that is greater than the start offset
        end_offset = len(raw_output)
        for v in metric_offsets.values():
            if v < end_offset and v > start_offset:
                end_offset = v

        return start_offset, end_offset
```
Find the start and end offsets of the metric in the raw output.
Start searching after the end of the target metric's JSON key ("overall_rating":), and stop before the start of the next metric ("some_other_score").
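A worked example with illustrative values, mirroring the computation rather than calling the method:

```python
raw_output = '{"overall_rating": 4, "clarity": 5}'
metric_offsets = {"overall_rating": 1, "clarity": 22}

start = metric_offsets["overall_rating"] + len("overall_rating")  # 15
end = min((v for v in metric_offsets.values() if v > start), default=len(raw_output))  # 22
print(raw_output[start:end])  # 'g": 4, ' -- the span scanned for the rating token
```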
```python
    def rating_token_to_score(
        self, token_logprob: ChatCompletionTokenLogprob
    ) -> float | None:
        """
        Convert a rating token to a score using weighted average of top logprobs.

        Only includes tokens that have valid scores.

        Some cleanup for upper case, whitespace and quotes. LLMs aren't always consistent.
        """
        primary_token_score = self.score_from_token_string(token_logprob.token)
        # Check this is a real rating token; it could just be ": ", "," or whitespace
        if primary_token_score is None:
            return None

        total_score = 0.0
        total_probability = 0.0
        top_logprobs_contains_primary_token = False

        # Process all valid scoring tokens from the alternatives
        for top_logprob in token_logprob.top_logprobs:
            if top_logprob.token == token_logprob.token:
                top_logprobs_contains_primary_token = True
            token_score = self.score_from_token_string(top_logprob.token)
            if token_score is not None:
                # Convert logprob to probability
                probability = math.exp(top_logprob.logprob)
                total_score += token_score * probability
                total_probability += probability

        # Weird OpenAI 4o bug - sometimes the primary token is included in the top logprobs, sometimes not.
        # Add the primary token back in if it was excluded.
        if not top_logprobs_contains_primary_token:
            if token_logprob.logprob == -9999.0:
                # Another "bug" - sometimes the logprob is -9999.0. This seems to happen when the rest of the logprobs have tiny probability.
                total_score += primary_token_score * 1.0
                total_probability += 1.0
            else:
                probability = math.exp(token_logprob.logprob)
                total_score += primary_token_score * probability
                total_probability += probability

        if total_probability <= 0.0:
            raise RuntimeError(
                f"No valid scoring tokens found for {token_logprob.token}. This should never happen as the token has a valid score (so it must be excluded from top logprobs). Please file a bug if you see this."
            )

        # Normalize by the total probability of valid tokens (the LLM may have wanted to generate other non-rating tokens; these shouldn't lower the score of rating tokens)
        weighted_score = total_score / total_probability

        return weighted_score
```
Convert a rating token to a score using weighted average of top logprobs.
Only includes tokens that have valid scores.
Some cleanup for upper case, whitespace and quotes. LLMs aren't always consistent.
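A worked example with illustrative probabilities: non-rating alternatives (here a stray quote token) are excluded, and the result is renormalized over the valid rating tokens only.

```python
import math

# Hypothetical top logprobs for the sampled rating position
top_logprobs = {"4": math.log(0.6), "5": math.log(0.3), '"': math.log(0.1)}
rating_values = {"4": 4.0, "5": 5.0}  # only these count as valid scores

total_score = sum(rating_values[t] * math.exp(lp) for t, lp in top_logprobs.items() if t in rating_values)
total_probability = sum(math.exp(lp) for t, lp in top_logprobs.items() if t in rating_values)

print(total_score / total_probability)  # (4*0.6 + 5*0.3) / 0.9 = 4.333...
```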
```python
    def score_from_token_string(self, token: str) -> float | None:
        if token in TOKEN_TO_SCORE_MAP:
            return TOKEN_TO_SCORE_MAP[token]

        # Handle more token variations like '"1"', '"pass"', ' paSS' and 'PASS'
        unquoted_token = token.strip().strip('"').lower()
        if unquoted_token in TOKEN_TO_SCORE_MAP:
            return TOKEN_TO_SCORE_MAP[unquoted_token]

        # Handle numeric tokens like "1.0"
        try:
            float_value = float(token)
            if float_value.is_integer():
                str_token = str(int(float_value))
                if str_token in TOKEN_TO_SCORE_MAP:
                    return TOKEN_TO_SCORE_MAP[str_token]
        except ValueError:
            pass

        return None
```
```python
    def metric_offsets(self, raw_output: str, metrics: List[str]) -> Dict[str, int]:
        """
        Find the offset to the start of each metric in the raw output JSON.

        For the example JSON `{"overall_rating": 1}` it should return:

        {
            "overall_rating": 1  # the metric name starts 1 character into the JSON string
        }
        """
        metric_offsets: Dict[str, int] = {}
        for metric in metrics:
            # The quoted metric name is expected in the JSON, e.g. `"overall_rating"` in `{"overall_rating": 1}`
            metric_name = f'"{metric}"'

            # We expect it exactly once
            count = raw_output.count(metric_name)
            if count != 1:
                raise ValueError(
                    f"Metric {metric} should appear exactly once in the output. Found {count} times"
                )

            offset = raw_output.find(metric_name)
            if offset == -1:
                raise ValueError(f"Metric {metric} not found in raw output")
            metric_offsets[metric] = offset
        return metric_offsets
```
Find the offset to the start of each metric in the raw output JSON.

For the example JSON `{"overall_rating": 1}` it should return `{"overall_rating": 1}`, since the metric name starts 1 character into the JSON string.
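A standalone illustration of the offset lookup, mirroring the logic above rather than calling it:

```python
raw_output = '{"overall_rating": 4, "clarity": 5}'
metrics = ["overall_rating", "clarity"]

offsets = {m: raw_output.find(f'"{m}"') for m in metrics}
print(offsets)  # {'overall_rating': 1, 'clarity': 22}
```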