from faker import Faker
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine, OperatorConfig
from presidio_anonymizer.operators import Operator, OperatorType
from .anonymizer import Anonymizer
# Original code from https://microsoft.github.io/presidio/samples/python/pseudonymization/
[docs]
class InstanceCounterAnonymizer(Operator):
    """Replace each entity value with a per-type instance counter.

    Every distinct value of an entity type gets its own placeholder built
    from ``REPLACING_FORMAT`` (for example ``<PERSON_0>``, ``<PERSON_1>``).
    Placeholders are stored in the caller-supplied ``entity_mapping`` so a
    value that appears again is replaced by the same placeholder.
    """

    REPLACING_FORMAT = "<{entity_type}_{index}>"

    def operate(self, text: str, params: dict = None) -> str:
        """Return the counter placeholder for ``text``.

        Args:
            text: The entity value to replace.
            params: Must contain ``entity_type`` and ``entity_mapping``.
                ``entity_mapping`` is a dict of dicts
                (``{entity_type: {value: placeholder}}``) and is updated
                in place.

        Returns:
            The placeholder already stored for ``text``, or a newly
            numbered one.
        """
        entity_type: str = params["entity_type"]
        # entity_mapping is a dict of dicts containing mappings per entity type
        entity_mapping = params["entity_mapping"]
        mapping_for_type = entity_mapping.get(entity_type)

        if not mapping_for_type:
            # Nothing stored for this type yet: start a fresh mapping at 0.
            mapping_for_type = entity_mapping[entity_type] = {}
            index = 0
        elif text in mapping_for_type:
            # Value seen before: reuse its placeholder.
            return mapping_for_type[text]
        else:
            index = self._get_last_index(mapping_for_type) + 1

        placeholder = self.REPLACING_FORMAT.format(
            entity_type=entity_type, index=index
        )
        mapping_for_type[text] = placeholder
        return placeholder

    @staticmethod
    def _get_last_index(entity_mapping_for_type: dict) -> int:
        """Return the highest counter among the stored placeholders."""
        # A placeholder looks like "<TYPE_7>": take the text after the last
        # underscore and strip the closing ">".
        return max(
            int(placeholder.rsplit("_", 1)[-1][:-1])
            for placeholder in entity_mapping_for_type.values()
        )

    def validate(self, params: dict = None) -> None:
        """Validate operator parameters.

        Raises:
            ValueError: If ``params`` is missing/empty or lacks
                ``entity_mapping`` or ``entity_type``.
        """
        # ``params`` defaults to None; guard first so a missing dict is
        # reported as a ValueError rather than a TypeError from ``in``.
        if not params or "entity_mapping" not in params:
            raise ValueError("An input Dict called `entity_mapping` is required.")
        if "entity_type" not in params:
            raise ValueError("An entity_type param is required.")

    def operator_name(self) -> str:
        """Return the name under which this operator is registered."""
        return "entity_counter"

    def operator_type(self) -> OperatorType:
        """Return the operator type (anonymize)."""
        return OperatorType.Anonymize
[docs]
class UniquePlaceholderPerEntity(Anonymizer):
    """Anonymizer that swaps detected entities for ``entity_counter`` placeholders."""

    def __init__(self):
        # Build both engines, then register the custom operator class on
        # the anonymizer engine.
        self.analyzer = AnalyzerEngine()
        engine = AnonymizerEngine()
        engine.add_anonymizer(InstanceCounterAnonymizer)
        self.anonymizer_engine = engine

    def anonymize(self, text) -> str:
        """Analyze ``text`` with language ``"en"`` and return the anonymized text."""
        detections = self.analyzer.analyze(text=text, language="en")
        # A fresh mapping per call: placeholders are only shared within one text.
        operators = {
            "DEFAULT": OperatorConfig("entity_counter", {"entity_mapping": {}}),
        }
        outcome = self.anonymizer_engine.anonymize(text, detections, operators)
        return outcome.text
[docs]
class DeletionAnonymizer(Operator):
    """Anonymizer which deletes entities by replacing them with an empty string."""

    REPLACING_FORMAT = ""

    def operate(self, text: str, params: dict = None) -> str:
        """Return the empty replacement; ``text`` and ``params`` are not used."""
        return self.REPLACING_FORMAT

    def validate(self, params: dict = None) -> None:
        """Validate operator parameters (this operator requires none)."""

    def operator_name(self) -> str:
        """Return the name under which this operator is registered."""
        return "entity_remover"

    def operator_type(self) -> OperatorType:
        """Return the operator type (anonymize)."""
        return OperatorType.Anonymize
[docs]
class EntityDeletion(Anonymizer):
    """Anonymizer that runs detected entities through the ``entity_remover`` operator."""

    def __init__(self):
        # Build both engines, then register the custom operator class on
        # the anonymizer engine.
        self.analyzer = AnalyzerEngine()
        engine = AnonymizerEngine()
        engine.add_anonymizer(DeletionAnonymizer)
        self.anonymizer_engine = engine

    def anonymize(self, text) -> str:
        """Analyze ``text`` with language ``"en"`` and return the anonymized text."""
        detections = self.analyzer.analyze(text=text, language="en")
        operators = {"DEFAULT": OperatorConfig("entity_remover")}
        outcome = self.anonymizer_engine.anonymize(text, detections, operators)
        return outcome.text
[docs]
class PlaceholderAnonymizer(Operator):
    """Anonymizer which replaces every entity value with the fixed ``<ENTITY>`` placeholder."""

    REPLACING_FORMAT = "<ENTITY>"

    def operate(self, text: str, params: dict = None) -> str:
        """Return the fixed placeholder; ``text`` and ``params`` are not used."""
        return self.REPLACING_FORMAT

    def validate(self, params: dict = None) -> None:
        """Validate operator parameters (this operator requires none)."""

    def operator_name(self) -> str:
        """Return the name under which this operator is registered."""
        return "entity_placeholder"

    def operator_type(self) -> OperatorType:
        """Return the operator type (anonymize)."""
        return OperatorType.Anonymize
[docs]
class CategoryAnonymizer(Operator):
    """Replace each entity value with its category placeholder.

    The placeholder is built from ``REPLACING_FORMAT`` (for example
    ``<PERSON>``). Each replacement is also recorded in the caller-supplied
    ``entity_mapping``.
    """

    REPLACING_FORMAT = "<{entity_type}>"

    def operate(self, text: str, params: dict = None) -> str:
        """Return the category placeholder for ``text``.

        Args:
            text: The entity value to replace.
            params: Must contain ``entity_type`` and ``entity_mapping``.
                ``entity_mapping`` is a dict of dicts
                (``{entity_type: {value: placeholder}}``) and is updated
                in place.

        Returns:
            The placeholder already stored for ``text``, or the formatted
            category placeholder.
        """
        entity_type: str = params["entity_type"]
        # entity_mapping is a dict of dicts containing mappings per entity type
        entity_mapping = params["entity_mapping"]
        mapping_for_type = entity_mapping.get(entity_type)

        if not mapping_for_type:
            # Nothing stored for this type yet: start a fresh mapping.
            mapping_for_type = entity_mapping[entity_type] = {}
        elif text in mapping_for_type:
            # Value seen before: return what was stored for it.
            return mapping_for_type[text]

        placeholder = self.REPLACING_FORMAT.format(entity_type=entity_type)
        mapping_for_type[text] = placeholder
        return placeholder

    def validate(self, params: dict = None) -> None:
        """Validate operator parameters.

        Raises:
            ValueError: If ``params`` is missing/empty or lacks
                ``entity_mapping`` or ``entity_type``.
        """
        # ``params`` defaults to None; guard first so a missing dict is
        # reported as a ValueError rather than a TypeError from ``in``.
        if not params or "entity_mapping" not in params:
            raise ValueError("An input Dict called `entity_mapping` is required.")
        if "entity_type" not in params:
            raise ValueError("An entity_type param is required.")

    def operator_name(self) -> str:
        """Return the name under which this operator is registered."""
        return "entity_category"

    def operator_type(self) -> OperatorType:
        """Return the operator type (anonymize)."""
        return OperatorType.Anonymize
[docs]
class CategoryPlaceholder(Anonymizer):
    """Anonymizer that runs detected entities through the ``entity_category`` operator."""

    def __init__(self):
        # Build both engines, then register the custom operator class on
        # the anonymizer engine.
        self.analyzer = AnalyzerEngine()
        engine = AnonymizerEngine()
        engine.add_anonymizer(CategoryAnonymizer)
        self.anonymizer_engine = engine

    def anonymize(self, text) -> str:
        """Analyze ``text`` with language ``"en"`` and return the anonymized text."""
        detections = self.analyzer.analyze(text=text, language="en")
        # A fresh mapping is handed to the operator on every call.
        operators = {
            "DEFAULT": OperatorConfig("entity_category", {"entity_mapping": {}}),
        }
        outcome = self.anonymizer_engine.anonymize(text, detections, operators)
        return outcome.text
[docs]
class FakerAnonymizer(Operator):
    """Replace each entity value with a Faker-generated fake value.

    Generated values are stored in the caller-supplied ``entity_mapping`` so
    a value that appears again is replaced by the same fake value.
    """

    # Underscore-separated parts of an entity type that mark it as an IP type.
    _IP_TOKENS = frozenset({"ip", "ipv4", "ipv6"})

    def __init__(self):
        self.faker = Faker()

    def operate(self, text: str, params: dict = None) -> str:
        """Return the fake replacement for ``text``.

        Args:
            text: The entity value to replace.
            params: Must contain ``entity_type`` and ``entity_mapping``.
                ``entity_mapping`` is a dict of dicts
                (``{entity_type: {value: fake_value}}``) and is updated
                in place.

        Returns:
            The fake value already stored for ``text``, or a newly
            generated one.
        """
        entity_type: str = params["entity_type"]
        # entity_mapping is a dict of dicts containing mappings per entity type
        entity_mapping = params["entity_mapping"]
        mapping_for_type = entity_mapping.get(entity_type)

        if not mapping_for_type:
            # Nothing stored for this type yet: start a fresh mapping.
            mapping_for_type = entity_mapping[entity_type] = {}
        elif text in mapping_for_type:
            # Value seen before: reuse its fake replacement.
            return mapping_for_type[text]

        fake_value = self._generate_fake_value(entity_type)
        mapping_for_type[text] = fake_value
        return fake_value

    def _generate_fake_value(self, entity_type: str) -> str:
        """Generate a fake value chosen by keywords in ``entity_type``.

        Matching is case-insensitive and the first matching rule wins.
        Types that match no rule fall back to a random word.
        """
        kind = entity_type.lower()

        if "person" in kind or "name" in kind:
            return self.faker.name()
        if "phone" in kind:
            return self.faker.phone_number()
        if "email" in kind:
            return self.faker.email()
        # IP types are matched on whole underscore-separated tokens and
        # checked before the generic "address" rule, so a type such as
        # IP_ADDRESS yields an IP rather than a postal address, and types
        # that merely contain the letters "ip" are not mistaken for one.
        if self._IP_TOKENS.intersection(kind.split("_")):
            return self.faker.ipv4()
        if "address" in kind:
            return self.faker.address().replace("\n", ", ")
        if "credit" in kind or "card" in kind:
            return self.faker.credit_card_number()
        if "ssn" in kind or "social" in kind:
            return self.faker.ssn()
        if "date" in kind or "birth" in kind:
            return self.faker.date()
        if "url" in kind:
            return self.faker.url()
        if "company" in kind or "org" in kind:
            return self.faker.company()
        if "location" in kind or "city" in kind:
            return self.faker.city()
        if "country" in kind:
            return self.faker.country()
        if "iban" in kind:
            return self.faker.iban()
        if "passport" in kind:
            return self.faker.passport_number()
        # Default for unknown entity types
        return self.faker.word()

    def validate(self, params: dict = None) -> None:
        """Validate operator parameters.

        Raises:
            ValueError: If ``params`` is missing/empty or lacks
                ``entity_mapping`` or ``entity_type``.
        """
        # ``params`` defaults to None; guard first so a missing dict is
        # reported as a ValueError rather than a TypeError from ``in``.
        if not params or "entity_mapping" not in params:
            raise ValueError("An input Dict called `entity_mapping` is required.")
        if "entity_type" not in params:
            raise ValueError("An entity_type param is required.")

    def operator_name(self) -> str:
        """Return the name under which this operator is registered."""
        return "faker_anonymizer"

    def operator_type(self) -> OperatorType:
        """Return the operator type (anonymize)."""
        return OperatorType.Anonymize
[docs]
class FakerPlaceholder(Anonymizer):
    """Anonymizer that runs detected entities through the ``faker_anonymizer`` operator."""

    def __init__(self):
        # Build both engines, then register the custom operator class on
        # the anonymizer engine.
        self.analyzer = AnalyzerEngine()
        engine = AnonymizerEngine()
        engine.add_anonymizer(FakerAnonymizer)
        self.anonymizer_engine = engine

    def anonymize(self, text) -> str:
        """Analyze ``text`` with language ``"en"`` and return the anonymized text."""
        detections = self.analyzer.analyze(text=text, language="en")
        # A fresh mapping per call: fake values are only shared within one text.
        operators = {
            "DEFAULT": OperatorConfig("faker_anonymizer", {"entity_mapping": {}}),
        }
        outcome = self.anonymizer_engine.anonymize(text, detections, operators)
        return outcome.text