Source code for tau_eval.models.authorship

import torch
from nltk.tokenize import sent_tokenize

from transformers import pipeline

from .anonymizer import Anonymizer


class KeepItSimple(Anonymizer):
    """Sentence-level rewriter backed by ``philippelaban/keep_it_simple``.

    Every input text is split into sentences with NLTK's ``sent_tokenize``,
    each sentence is sent through a Hugging Face pipeline, and the generated
    sentences are joined with single spaces into one string per input text.
    """

    # Marker appended to each sentence before it is handed to the pipeline.
    _PROMPT_SUFFIX = "<|endoftext|>"

    def __init__(self, name="Keep-It-Simple"):
        """Create the generation pipeline.

        Args:
            name: Label stored on the instance as ``self.name``.
        """
        self.name = name
        self.pipeline = pipeline(model="philippelaban/keep_it_simple", max_new_tokens=512)

    def anonymize(self, text: str) -> str:
        """Rewrite a single text.

        Delegates to :meth:`anonymize_batch` so that the single-text and the
        batched code paths always produce the same output for the same input
        (sentences joined by one space, no leading whitespace).

        Args:
            text: The text to rewrite.

        Returns:
            The rewritten text, or ``""`` when ``text`` contains no sentences.
        """
        return self.anonymize_batch([text])[0]

    def anonymize_batch(self, texts: list[str]) -> list[str]:
        """Rewrite several texts with a single pipeline call.

        Args:
            texts: The texts to rewrite.

        Returns:
            One rewritten string per entry of ``texts``, in the same order.
        """
        prompts = []
        sentence_counts = []
        for text in texts:
            split = sent_tokenize(text)
            sentence_counts.append(len(split))
            prompts.extend(sentence + self._PROMPT_SUFFIX for sentence in split)

        # Nothing to generate: avoid calling the pipeline with an empty batch.
        if not prompts:
            return [""] * len(sentence_counts)

        outputs = self.pipeline(prompts)
        # The first len(prompt) characters are dropped, which assumes that
        # ``generated_text`` starts with the prompt it was generated from.
        rewritten = [
            output[0]["generated_text"][len(prompt):]
            for prompt, output in zip(prompts, outputs)
        ]

        # Hand the flat sentence list back out to the texts it came from,
        # walking a running offset instead of repeatedly deleting list heads.
        anonymized_texts = []
        start = 0
        for count in sentence_counts:
            end = start + count
            anonymized_texts.append(" ".join(rewritten[start:end]))
            start = end
        return anonymized_texts
class Paraphraser(Anonymizer):
    """Sentence-level paraphraser backed by ``alykassem/FLAN-T5-Paraphraser``.

    Every input text is split into sentences with NLTK's ``sent_tokenize``,
    each sentence is paraphrased by a ``text2text-generation`` pipeline, and
    the paraphrases are joined with single spaces into one string per text.

    Note:
        The default ``name`` is ``"PegasusParaphrase"`` although the loaded
        checkpoint is a FLAN-T5 model; the default is kept unchanged so that
        existing callers relying on the label keep working.
    """

    def __init__(self, name="PegasusParaphrase"):
        """Create the paraphrasing pipeline.

        Args:
            name: Label stored on the instance as ``self.name``.
        """
        self.name = name
        self.pipeline = pipeline(
            "text2text-generation",
            model="alykassem/FLAN-T5-Paraphraser",
            max_new_tokens=512,
            torch_dtype=torch.float16,
        )

    def anonymize(self, text: str) -> str:
        """Paraphrase a single text.

        Delegates to :meth:`anonymize_batch` so that the single-text and the
        batched code paths always produce the same output for the same input
        (sentences joined by one space, no leading whitespace).

        Args:
            text: The text to paraphrase.

        Returns:
            The paraphrased text, or ``""`` when ``text`` contains no
            sentences.
        """
        return self.anonymize_batch([text])[0]

    def anonymize_batch(self, texts: list[str]) -> list[str]:
        """Paraphrase several texts with a single pipeline call.

        Args:
            texts: The texts to paraphrase.

        Returns:
            One paraphrased string per entry of ``texts``, in the same order.
        """
        sentences = []
        sentence_counts = []
        for text in texts:
            split = sent_tokenize(text)
            sentence_counts.append(len(split))
            sentences.extend(split)

        # Nothing to generate: avoid calling the pipeline with an empty batch.
        if not sentences:
            return [""] * len(sentence_counts)

        paraphrases = [output["generated_text"] for output in self.pipeline(sentences)]

        # Hand the flat sentence list back out to the texts it came from,
        # walking a running offset instead of repeatedly deleting list heads.
        anonymized_texts = []
        start = 0
        for count in sentence_counts:
            end = start + count
            anonymized_texts.append(" ".join(paraphrases[start:end]))
            start = end
        return anonymized_texts
class M2M100MT(Anonymizer):
    """Round-trip machine-translation rewriter using ``facebook/m2m100_418M``.

    Every input text is split into sentences with NLTK's ``sent_tokenize``;
    each sentence is translated en -> de -> fr -> en, and the resulting
    English sentences are joined with single spaces into one string per text.
    """

    def __init__(self, name="M2M100MT"):
        """Create the three translation pipelines of the round trip.

        Args:
            name: Label stored on the instance as ``self.name``.
        """
        self.name = name
        # NOTE(review): all three pipelines are built from the same checkpoint
        # id; whether the weights could be shared between them is worth
        # checking, but the three attributes are kept as-is for callers.
        self.pipeline_en_de = pipeline(
            "translation", "facebook/m2m100_418M", src_lang="en", tgt_lang="de", max_new_tokens=512
        )
        self.pipeline_de_fr = pipeline(
            "translation", "facebook/m2m100_418M", src_lang="de", tgt_lang="fr", max_new_tokens=512
        )
        self.pipeline_fr_en = pipeline(
            "translation", "facebook/m2m100_418M", src_lang="fr", tgt_lang="en", max_new_tokens=512
        )

    def _round_trip(self, sentences: list[str]) -> list[str]:
        """Translate ``sentences`` en -> de -> fr -> en, preserving order."""
        translated = sentences
        for hop in (self.pipeline_en_de, self.pipeline_de_fr, self.pipeline_fr_en):
            translated = [output["translation_text"] for output in hop(translated)]
        return translated

    def anonymize(self, text: str) -> str:
        """Rewrite a single text through the translation round trip.

        Delegates to :meth:`anonymize_batch` so that the single-text and the
        batched code paths always produce the same output for the same input
        (sentences joined by one space, no leading whitespace).

        Args:
            text: The text to rewrite.

        Returns:
            The rewritten text, or ``""`` when ``text`` contains no sentences.
        """
        return self.anonymize_batch([text])[0]

    def anonymize_batch(self, texts: list[str]) -> list[str]:
        """Rewrite several texts, batching all their sentences per hop.

        Args:
            texts: The texts to rewrite.

        Returns:
            One rewritten string per entry of ``texts``, in the same order.
        """
        sentences = []
        sentence_counts = []
        for text in texts:
            split = sent_tokenize(text)
            sentence_counts.append(len(split))
            sentences.extend(split)

        # Nothing to translate: avoid calling the pipelines with empty batches.
        if not sentences:
            return [""] * len(sentence_counts)

        back_translated = self._round_trip(sentences)

        # Hand the flat sentence list back out to the texts it came from,
        # walking a running offset instead of repeatedly deleting list heads.
        anonymized_texts = []
        start = 0
        for count in sentence_counts:
            end = start + count
            anonymized_texts.append(" ".join(back_translated[start:end]))
            start = end
        return anonymized_texts