Learn basic techniques to remove or obscure personally identifiable information (PII) from datasets while preserving enough utility for AI training.
Common Techniques (all four are sketched in code after this list):
- Masking: replace some or all characters with placeholders such as XXX or *** (e.g. ***-**-6789 for an SSN)
- Tokenization: replace each value with a random token, keeping any token-to-value mapping in a separate, secured store
- Generalization: replace exact values with broader categories (e.g. age 34 becomes the range 20-40)
- Suppression: remove sensitive fields or records entirely
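A minimal sketch of all four techniques on a toy pandas DataFrame; the column names, token format, and age bins are illustrative:

import uuid
import pandas as pd

df = pd.DataFrame({
    'name': ['Ada Lovelace', 'Alan Turing'],
    'ssn': ['123-45-6789', '987-65-4321'],
    'age': [36, 41],
    'email': ['ada@example.com', 'alan@example.com'],
})

# Masking: keep only the last four SSN digits.
df['ssn'] = '***-**-' + df['ssn'].str[-4:]

# Tokenization: map each name to a random token; store the mapping
# separately (and securely) if re-identification must stay possible.
tokens = {name: f'user_{uuid.uuid4().hex[:8]}' for name in df['name'].unique()}
df['name'] = df['name'].map(tokens)

# Generalization: bucket exact ages into broad ranges.
df['age'] = pd.cut(df['age'], bins=[0, 20, 40, 60, 100],
                   labels=['0-20', '20-40', '40-60', '60-100'])

# Suppression: drop the email column entirely.
df = df.drop(columns=['email'])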
Apply advanced anonymization with k-anonymity, l-diversity, and t-closeness for more robust privacy protection. A dataset is k-anonymous when every record shares its quasi-identifier values (age, zipcode, and similar indirect identifiers) with at least k-1 other records, so no individual can be singled out within a group; l-diversity and t-closeness additionally constrain the distribution of sensitive values inside each group. The function below is a minimal generalize-and-suppress implementation in plain pandas; the generalization rules for age, zipcode, and date are illustrative.
import pandas as pd

def apply_k_anonymity(df, quasi_identifiers, k=5):
    anon_df = df.copy()
    # Generalization rules (illustrative; adapt to your schema).
    if 'age' in quasi_identifiers:
        anon_df['age'] = pd.cut(anon_df['age'], bins=[0, 20, 40, 60, 100],
                                labels=['0-20', '20-40', '40-60', '60-100'])
    if 'zipcode' in quasi_identifiers:
        anon_df['zipcode'] = anon_df['zipcode'].astype(str).str[:3] + '**'
    if 'date' in quasi_identifiers:
        anon_df['date'] = pd.to_datetime(anon_df['date']).dt.year
    # Suppression: drop records whose equivalence class has fewer than k rows.
    anon_df = anon_df.groupby(quasi_identifiers, observed=True).filter(
        lambda g: len(g) >= k
    )
    # Verify the k-anonymity property on what remains.
    groups = anon_df.groupby(quasi_identifiers, observed=True).size()
    assert groups.min() >= k, f"k-anonymity violated: min group size {groups.min()}"
    return anon_df
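For example, with toy data and k=2 (the data and the choice of quasi-identifiers are made up):

records = pd.DataFrame({
    'age': [23, 27, 34, 38, 25, 31],
    'zipcode': ['94107', '94110', '94107', '94110', '94107', '94110'],
    'income': [55000, 61000, 72000, 58000, 64000, 69000],
})
safe = apply_k_anonymity(records, ['age', 'zipcode'], k=2)
# Every ('age', 'zipcode') combination in `safe` now covers >= 2 records.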
Implement differential privacy, synthetic data generation, and privacy-preserving machine learning techniques. Differential privacy adds calibrated noise so that a query's output barely changes whether or not any single individual's record is present: epsilon bounds the privacy loss (smaller means stronger privacy) and delta bounds the probability of exceeding it. The engine below is a sketch built on PyDP's BoundedMean for private means, with a NumPy-sampled Laplace mechanism for generic scalar noise.
import numpy as np
from pydp.algorithms.laplacian import BoundedMean  # PyDP 1.x API

class DifferentialPrivacyEngine:
    def __init__(self, epsilon=1.0, delta=1e-5):
        self.epsilon = epsilon  # privacy budget: smaller means stronger privacy
        self.delta = delta      # allowed probability of exceeding the budget

    def private_mean(self, data, bounds):
        # PyDP clamps each value to [lower_bound, upper_bound] so the
        # query's sensitivity is bounded, then returns a noised mean.
        mean_algorithm = BoundedMean(
            epsilon=self.epsilon,
            lower_bound=bounds[0],
            upper_bound=bounds[1],
            dtype='float',
        )
        return mean_algorithm.quick_result(data)

    def add_noise(self, value, sensitivity):
        # Laplace mechanism: sample noise with scale = sensitivity / epsilon.
        scale = sensitivity / self.epsilon
        noise = np.random.laplace(loc=0.0, scale=scale)
        return value + noise
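Usage is straightforward (the values here are illustrative):

engine = DifferentialPrivacyEngine(epsilon=0.5)
ages = [23.0, 27.0, 34.0, 38.0, 25.0, 31.0]
print(engine.private_mean(ages, bounds=(0, 100)))  # noised mean, clamped to [0, 100]
print(engine.add_noise(29.67, sensitivity=1.0))    # Laplace-noised scalar

For synthetic data generation, a minimal baseline is to resample each column independently from its empirical distribution. This preserves per-column statistics but deliberately breaks cross-column correlations, so production pipelines typically use copula- or deep-learning-based synthesizers instead. A sketch (the function name is ours):

import numpy as np
import pandas as pd

def synthesize_independent(df, n_rows, seed=None):
    # Sample every column independently from its observed values.
    # Caution: relationships between columns are not preserved.
    rng = np.random.default_rng(seed)
    return pd.DataFrame({
        col: rng.choice(df[col].to_numpy(), size=n_rows, replace=True)
        for col in df.columns
    })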