# Source code for tau_eval.utils

import copy
from typing import Callable

import numpy as np
import tasknet as tn
from datasets import Dataset

from tau_eval.metrics.bertscore import compute_bertscore
from tau_eval.metrics.cola import compute_cola, load_cola
from tau_eval.metrics.luar import compute_luar, load_luar
from tau_eval.metrics.meteor import compute_meteor
from tau_eval.metrics.nli import compute_nli, load_nli
from tau_eval.metrics.perplexity import compute_perplexity
from tau_eval.metrics.rouge import compute_rouge
from tau_eval.metrics.sbert import compute_sbert, load_sbert
from tau_eval.tasks.customtask import CustomTask

from .logger import logger


# Type for metric functions: a callable that receives the inputs and the outputs
# (each a single string or a list of strings) and returns a mapping from score
# name to score value.
MetricFunction = Callable[[str | list[str], str | list[str]], dict[str, float]]

# Names of the built-in metrics that evaluate_system_output accepts as strings.
VALID_METRICS = [
    "bertscore",
    "cola",
    "luar",
    "meteor",
    "nli",
    "perplexity",
    "rouge",
    "sbert",
]
# Metrics computed by evaluate_system_output when the caller does not pass `metrics`.
DEFAULT_METRICS = ["rouge", "meteor", "luar"]


def evaluate_system_output(
    inputs: list[str],
    outputs: list[str],
    metrics: list[str | MetricFunction] = DEFAULT_METRICS,
) -> dict:
    r"""
    Evaluate a system output with automatic metrics.

    Args:
        inputs: Source texts the system was given.
        outputs: Texts produced by the system for ``inputs``.
        metrics: Built-in metric names (members of ``VALID_METRICS``) and/or
            callables. Each callable is invoked as ``func(inputs, outputs)`` and
            must return a dict, which is merged into the result. Defaults to the
            module-level ``DEFAULT_METRICS`` list, which is never mutated here.

    Returns:
        A dict mapping score names to values. Built-in metrics are averaged with
        ``np.mean`` and stored under these keys: ``bertscore_precision``,
        ``bertscore_recall``, ``bertscore_f1``, ``cola``, ``luar``, ``meteor``,
        ``nli``, ``perplexity``, ``rouge1``, ``rouge2``, ``rougeL``, ``sbert``.
        ``cola`` and ``perplexity`` are computed from ``outputs`` only. Entries
        returned by custom callables are added last and may overwrite built-in
        keys with the same name.

    Raises:
        AssertionError: If a string metric is not in ``VALID_METRICS``.
        TypeError: If a metric is neither a string nor a callable.
    """
    for metric in metrics:
        if isinstance(metric, str):
            if metric not in VALID_METRICS:
                # Raised explicitly instead of via ``assert`` so the validation is
                # not stripped under ``python -O``; the exception type is kept so
                # existing callers see the same error.
                raise AssertionError(f'"{metric}" is not a valid metric. Choose among: {VALID_METRICS}')
        elif not callable(metric):
            raise TypeError(f"Metric must be a string or callable, got {type(metric)}")

    # Compute each metric
    metric_scores = {}

    if "bertscore" in metrics:
        scores = compute_bertscore(inputs, outputs)
        metric_scores["bertscore_precision"] = np.mean(scores["precision"])
        metric_scores["bertscore_recall"] = np.mean(scores["recall"])
        metric_scores["bertscore_f1"] = np.mean(scores["f1"])

    if "cola" in metrics:
        cola_tokenizer, cola_model = load_cola()
        scores = compute_cola(outputs, cola_tokenizer, cola_model)
        metric_scores["cola"] = np.mean(scores["cola"])

    if "luar" in metrics:
        sim_model = load_luar()
        scores = compute_luar(inputs, outputs, sim_model)
        metric_scores["luar"] = np.mean(scores["luar"])

    if "meteor" in metrics:
        scores = compute_meteor(inputs, outputs)
        metric_scores["meteor"] = np.mean(scores["meteor"])

    if "nli" in metrics:
        nli_tokenizer, nli_model = load_nli()
        scores = compute_nli(inputs, outputs, nli_tokenizer, nli_model)
        metric_scores["nli"] = np.mean(scores["entailment"])

    if "perplexity" in metrics:
        scores = compute_perplexity(outputs)
        metric_scores["perplexity"] = np.mean(scores["perplexities"])

    if "rouge" in metrics:
        scores = compute_rouge(inputs, outputs)
        metric_scores["rouge1"] = np.mean(scores["rouge1"])
        metric_scores["rouge2"] = np.mean(scores["rouge2"])
        metric_scores["rougeL"] = np.mean(scores["rougeL"])

    if "sbert" in metrics:
        sim_model = load_sbert()
        scores = compute_sbert(inputs, outputs, sim_model)
        metric_scores["sbert"] = np.mean(scores["sbert"])

    # Handle custom metric functions, in the order they were supplied.
    custom_metrics = [metric for metric in metrics if callable(metric)]
    for func in custom_metrics:
        custom_scores = func(inputs, outputs)
        metric_scores.update(custom_scores)

    return metric_scores
def run_models_on_task(
    models,
    task,
    metrics,
    classifier_name="answerdotai/ModernBERT-base",
    do_train=False,
    do_train_adversarial=False,
    device="cuda",
    export_generated_texts=True,
):
    """
    Evaluate anonymization models on a tasknet task and collect the scores.

    A classifier is built for a deep copy of ``task`` and evaluated on the
    original test split. Then, for every model, the task's text columns are
    rewritten with the model, the automatic metrics are computed between the
    original and rewritten test texts, and the classifier is evaluated again on
    the rewritten test split.

    Args:
        models: Iterable of models. A model exposing ``anonymize_batch`` is applied
            in batches of 64; otherwise ``anonymize`` is applied per example.
        task: Task object with ``dataset``, ``s1`` and ``s2`` attributes; it is
            deep-copied and never modified.
        metrics: Metrics forwarded to ``evaluate_system_output``.
        classifier_name: Value passed as ``model_name`` to the tasknet model/trainer.
        do_train: If true, train the classifier on the original task before evaluating.
        do_train_adversarial: If true, the train and validation splits are rewritten
            too and attached to the trainer.
        device: Currently unused by this function.
        export_generated_texts: If true, the original and rewritten test texts are
            saved to disk as a ``datasets.Dataset``.

    Returns:
        A dict with ``"metrics"`` (list of automatic metric names),
        ``"original_metrics"`` (classifier scores on the original test split, plus
        ``"cola"`` when requested) and one entry per model holding its scores.
    """
    ori_task = copy.deepcopy(task)
    args = {"model_name": classifier_name, "evaluation_strategy": None}
    results = {"metrics": metrics}

    m = tn.Model([ori_task], args)
    trainer = tn.Trainer(m, [ori_task], args)
    if do_train:
        trainer.train()
    ori_summary = trainer.evaluate(metric_key_prefix="test")
    results["original_metrics"] = ori_summary[0]

    if "cola" in metrics:
        cola_tokenizer, cola_model = load_cola()
        scores_input = compute_cola(ori_task.dataset["test"][ori_task.s1], cola_tokenizer, cola_model)
        results["original_metrics"]["cola"] = np.mean(scores_input["cola"])

    if export_generated_texts:
        tests = {"original": ori_task.dataset["test"][ori_task.s1]}

    # Train/validation are only rewritten for the adversarial setting; test always is.
    splits = ("train", "validation", "test") if do_train_adversarial else ("test",)

    for i, model in enumerate(models):
        logger.debug(f"Evaluating model {i}")
        new_task = copy.deepcopy(task)

        # Choose batched or per-example rewriting once, and use that same choice for
        # every split. The per-example branch previously referenced the batched
        # helper for train/validation, which fails for models without `anonymize_batch`.
        if hasattr(model, "anonymize_batch"):
            anonymize = model.anonymize_batch
            map_kwargs = {"batched": True, "batch_size": 64}
        else:
            anonymize = model.anonymize
            map_kwargs = {}

        def rewrite(record):
            # `record` is a batch (column -> list) or a single example, depending on map_kwargs.
            record[new_task.s1] = anonymize(record[new_task.s1])
            if new_task.s2 != "" and new_task.s2 in record:
                record[new_task.s2] = anonymize(record[new_task.s2])
            return record

        for split in splits:
            split_ds = new_task.dataset[split]
            new_task.dataset[split] = split_ds.map(rewrite, features=split_ds.features, **map_kwargs)

        metric_scores = evaluate_system_output(
            ori_task.dataset["test"][ori_task.s1],
            new_task.dataset["test"][new_task.s1],
            metrics,
        )
        results["metrics"] = list(metric_scores.keys())

        # Build a trainer for the rewritten task only to obtain its processed
        # datasets, then evaluate the original trainer on them.
        new_trainer = tn.Trainer(trainer.model, [new_task], args)
        trainer.test_dataset = new_trainer.test_dataset
        if do_train_adversarial:
            # NOTE(review): the rewritten train/validation sets are attached here, but
            # no training call follows in this function — confirm whether retraining
            # was intended.
            trainer.train_dataset = new_trainer.train_dataset
            trainer.eval_dataset = new_trainer.eval_dataset
        new_summary = trainer.evaluate(metric_key_prefix="test")
        metric_scores = metric_scores | new_summary[0]

        if hasattr(model, "name") and hasattr(task, "name"):
            label = f"{model.name}"
        else:
            label = f"{model.__class__.__name__}"
        results[label] = metric_scores
        if export_generated_texts:
            tests[label] = new_task.dataset["test"][new_task.s1]

    if export_generated_texts:
        ds = Dataset.from_dict(tests)
        # Fall back to the task's class name when the task has no `name` attribute
        # or saving under that name fails.
        try:
            ds.save_to_disk(f"{task.name}.hf")
        except Exception:
            ds.save_to_disk(f"{task.__class__.__name__}.hf")
        logger.debug("Saved generated dataset")

    return results
def run_models_on_custom_task(models, task: CustomTask, metrics):
    """
    Evaluate anonymization models on a ``CustomTask`` and collect the scores.

    The task is first evaluated on its original texts (column ``task.s1``). Then,
    for every model, that column is rewritten with the model, the automatic
    metrics are computed between original and rewritten texts, and the task's own
    ``evaluate`` is run on the rewritten texts.

    Args:
        models: Iterable of models. A model exposing ``anonymize_batch`` is applied
            in batches of 64; otherwise ``anonymize`` is applied per example.
        task: Custom task providing ``dataset``, ``s1`` and ``evaluate``; it is
            deep-copied and never modified.
        metrics: Metrics forwarded to ``evaluate_system_output``.

    Returns:
        A dict with ``"metrics"`` (automatic metric names followed by the task's
        own score names), ``"original_metrics"`` (task scores on the original
        texts, plus ``"cola"`` when requested) and one entry per model holding
        its scores.
    """
    ori_task = copy.deepcopy(task)
    results = {"metrics": metrics}

    ori_summary = ori_task.evaluate(ori_task.dataset[ori_task.s1])
    results["original_metrics"] = ori_summary

    if "cola" in metrics:
        # Unpack as (tokenizer, model), matching the other load_cola() call sites
        # in this module; the names were previously swapped before being passed on.
        cola_tokenizer, cola_model = load_cola()
        scores_input = compute_cola(ori_task.dataset[ori_task.s1], cola_tokenizer, cola_model)
        results["original_metrics"]["cola"] = np.mean(scores_input["cola"])

    for i, model in enumerate(models):
        logger.debug(f"Evaluating model {i}")
        new_task = copy.deepcopy(task)

        if hasattr(model, "anonymize_batch"):

            def rewrite_batch(batch):
                batch[new_task.s1] = model.anonymize_batch(batch[new_task.s1])
                return batch

            new_task.dataset = new_task.dataset.map(
                rewrite_batch,
                batched=True,
                batch_size=64,
                features=new_task.dataset.features,
            )
        else:

            def rewrite(example):
                example[new_task.s1] = model.anonymize(example[new_task.s1])
                return example

            new_task.dataset = new_task.dataset.map(rewrite, features=new_task.dataset.features)

        metric_scores = evaluate_system_output(ori_task.dataset[ori_task.s1], new_task.dataset[new_task.s1], metrics)
        results["metrics"] = list(metric_scores.keys())

        # Score the rewritten texts with the task's own evaluation and merge both sets.
        new_summary = ori_task.evaluate(new_task.dataset[new_task.s1])
        metric_scores = metric_scores | new_summary
        results["metrics"] += list(new_summary.keys())

        if hasattr(model, "name") and hasattr(task, "name"):
            results[f"{model.name}"] = metric_scores
        else:
            results[f"{model.__class__.__name__}"] = metric_scores

    return results