From 09e0ceee2e86d9c4031972e1d03ae4587cba00db Mon Sep 17 00:00:00 2001
From: mohsine hajar
Date: Mon, 23 Jun 2025 16:34:23 +0100
Subject: [PATCH] feat: enable privacy evaluation for models trained externally

---
 .../user_guide/privacy_estimation/privacy.rst |  106 +++++++++++++++++
 .../privacy_estimation/attack_runner.py       |    3 +-
 guardian_ai/privacy_estimation/dataset.py     |  110 +++++++++++++++++-
 tests/unitary/test_privacy_attacks.py         |   37 ++++++
 4 files changed, 252 insertions(+), 4 deletions(-)

diff --git a/docs/source/user_guide/privacy_estimation/privacy.rst b/docs/source/user_guide/privacy_estimation/privacy.rst
index 730b07f..c3ea116 100644
--- a/docs/source/user_guide/privacy_estimation/privacy.rst
+++ b/docs/source/user_guide/privacy_estimation/privacy.rst
@@ -290,3 +290,109 @@ by the given metric and print out the best attacks for each dataset for each mod
         graphs_dir=graph_dir,
         metric_to_sort_on="attack_accuracy",
     )
+
+
+************************************
+Evaluating Externally Trained Models
+************************************
+
+This section outlines how to assess the privacy risk of a model trained outside the Guardian AI framework.
+
+
+Step 1: Load Your Data
+----------------------
+
+Load the data that was used to train your model, along with a comparable dataset that was not used
+in training (for example, from CSV files or pandas DataFrames).
+
+.. code-block:: python
+
+    import pandas as pd
+
+    df_x_in = pd.read_csv("in_data.csv")  # Features from training data
+    df_y_in = pd.read_csv("in_labels.csv", header=None).squeeze()  # Labels from training data
+    df_x_out = pd.read_csv("out_data.csv")  # Features from non-training data
+    df_y_out = pd.read_csv("out_labels.csv", header=None).squeeze()  # Labels from non-training data
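+
+If your "in" and "out" examples live in a single table and you already know which rows were used
+for training, you can slice that table instead of loading separate files. The snippet below is a
+minimal sketch; the 250-row boundary and the ``df_x``/``df_y`` names are purely illustrative and
+should be replaced with whatever matches your actual training setup.
+
+.. code-block:: python
+
+    # Hypothetical example: the first 250 rows were used to train the model,
+    # the remaining rows were never seen by it.
+    df_x_in, df_y_in = df_x.iloc[:250], df_y.iloc[:250]
+    df_x_out, df_y_out = df_x.iloc[250:], df_y.iloc[250:]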
+
+
+Step 2: Prepare Attack Splits
+-----------------------------
+
+Use the ``prepare_attack_data_for_pretrained_model`` method to create attack-specific data splits:
+
+.. code-block:: python
+
+    from guardian_ai.privacy_estimation.dataset import ClassificationDataset, DataSplit
+
+    dataset = ClassificationDataset("your_dataset_name")
+    dataset.prepare_attack_data_for_pretrained_model(
+        data_split_seed=42,
+        dataset_split_ratios={
+            DataSplit.ATTACK_TRAIN_IN: 0.3,
+            DataSplit.ATTACK_TEST_IN: 0.7,
+            DataSplit.ATTACK_TRAIN_OUT: 0.3,
+            DataSplit.ATTACK_TEST_OUT: 0.7,
+        },
+        df_x_in=df_x_in,
+        df_y_in=df_y_in,
+        df_x_out=df_x_out,
+        df_y_out=df_y_out,
+    )
+
+
+Step 3: Wrap Your Model
+-----------------------
+
+Wrap your pretrained model to make it compatible with the framework:
+
+.. code-block:: python
+
+    from guardian_ai.privacy_estimation.model import TargetModel
+
+    class ExternalTargetModel(TargetModel):
+        """
+        Wrapper for external pretrained models.
+        """
+        def __init__(self, model):
+            self.model = model
+
+        def get_model(self):
+            return self.model
+
+        def get_model_name(self):
+            return "external_model"
+
+        def get_prediction_probs(self, X):
+            return self.model.predict_proba(X)
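+
+Before running any attacks, it is worth a quick sanity check that the wrapper really returns class
+probabilities. The snippet below is a minimal sketch that assumes a scikit-learn style
+``predict_proba`` returning one row per sample; ``your_external_model`` stands in for your own
+model object.
+
+.. code-block:: python
+
+    import numpy as np
+
+    target_model = ExternalTargetModel(your_external_model)
+    probs = target_model.get_prediction_probs(df_x_in.head(5))
+
+    assert probs.shape[0] == 5                  # one row per sample
+    assert np.allclose(probs.sum(axis=1), 1.0)  # each row is a probability distribution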
+
+
+Step 4: Register Attacks and Run Evaluation
+-------------------------------------------
+
+Instantiate the attack runner and execute the evaluation:
+
+.. code-block:: python
+
+    from guardian_ai.privacy_estimation.attack import AttackType
+    from guardian_ai.privacy_estimation.attack_runner import AttackRunner
+
+    # Wrap the external model once and reuse it
+    target_model = ExternalTargetModel(your_external_model)
+
+    # Initialize the attack runner
+    attack_runner = AttackRunner(
+        dataset=dataset,
+        target_models=[target_model],
+        attacks=[
+            AttackType.LossBasedBlackBoxAttack,
+            AttackType.ConfidenceBasedBlackBoxAttack,
+            AttackType.MerlinAttack,
+        ],
+        threshold_grids={AttackType.MerlinAttack.name: [0.001, 0.01, 0.1]},
+    )
+
+    results = attack_runner.run_attack(
+        target_model=target_model,
+        attack_type=AttackType.MerlinAttack,
+        metric_functions=["precision", "recall", "f1", "accuracy"],
+        cache_input=True,
+    )
+
+
+Notes
+-----
+
+1. Data preprocessing: ensure ``df_x_in`` and ``df_x_out`` are preprocessed exactly as the training data was when the model was trained (a sketch follows this list).
+2. Split ratios: ``ATTACK_TRAIN_IN`` + ``ATTACK_TEST_IN`` must sum to ``1.0``, and so must ``ATTACK_TRAIN_OUT`` + ``ATTACK_TEST_OUT``.
+3. Model compatibility: the external model must expose a ``predict_proba`` method.
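+
+One way to satisfy the preprocessing note is to reuse the transformer that was fitted at training
+time rather than fitting a new one. The sketch below assumes you saved a scikit-learn preprocessing
+pipeline with dense output to a hypothetical ``preprocessor.joblib`` file; the file name and the
+pipeline itself are illustrative, not part of Guardian AI.
+
+.. code-block:: python
+
+    import joblib
+    import pandas as pd
+
+    # Load the transformer that was fitted on the original training data.
+    preprocessor = joblib.load("preprocessor.joblib")
+
+    # Use transform (not fit_transform) so both frames receive the training-time preprocessing.
+    df_x_in = pd.DataFrame(preprocessor.transform(df_x_in), index=df_x_in.index)
+    df_x_out = pd.DataFrame(preprocessor.transform(df_x_out), index=df_x_out.index)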
\ No newline at end of file
diff --git a/guardian_ai/privacy_estimation/attack_runner.py b/guardian_ai/privacy_estimation/attack_runner.py
index 11d503d..92ce4a8 100644
--- a/guardian_ai/privacy_estimation/attack_runner.py
+++ b/guardian_ai/privacy_estimation/attack_runner.py
@@ -60,7 +60,7 @@ def __init__(
             AttackRunner
         """
         self.dataset = dataset
-        assert self.dataset.target_model_data is not None
         assert self.dataset.attack_model_data is not None
         self.target_models = target_models
         self.attacks = attacks
@@ -70,6 +70,7 @@ def __init__(
 
     def train_target_models(self):
         for target_model in self.target_models:
+            # Externally trained models do not populate target_model_data, so check here instead of in __init__
+            assert self.dataset.target_model_data is not None
             print("Target Model: " + target_model.get_model_name())
             target_model_data: TargetModelData = self.dataset.target_model_data
             classifier = target_model.train_model(
diff --git a/guardian_ai/privacy_estimation/dataset.py b/guardian_ai/privacy_estimation/dataset.py
index d00baa3..8eb3f90 100644
--- a/guardian_ai/privacy_estimation/dataset.py
+++ b/guardian_ai/privacy_estimation/dataset.py
@@ -168,7 +168,13 @@ def __init__(self, name: str = None, df_x=None, df_y=None):
         self.df_x, self.df_y = df_x, df_y
         self.splits = {}
 
-    def split_dataset(self, seed: int, split_array: List[float], split_names: List[str] = None):
+    def split_dataset(
+        self,
+        seed: int,
+        split_array: List[float],
+        split_names: List[str] = None,
+        df_x: pd.DataFrame = None,
+        df_y: pd.Series = None,
+    ):
         """
         Splits dataset according to the specified fractions.
 
@@ -180,6 +186,10 @@ def split_dataset(self, seed: int, split_array: List[float], split_names: List[s
             Array of fractions to split the data in. Must sum to 1.
         split_names: List[str]
             Names assigned to the splits.
+        df_x: pd.DataFrame, optional
+            If provided, use this instead of self.df_x.
+        df_y: pd.Series, optional
+            If provided, use this instead of self.df_y.
 
         Returns
         -------
@@ -193,8 +203,8 @@ def split_dataset(self, seed: int, split_array: List[float], split_names: List[s
         assert len(split_array) == len(split_names)
 
         x_2, y_2 = (
-            self.df_x,
-            self.df_y,
+            df_x if df_x is not None else self.df_x,
+            df_y if df_y is not None else self.df_y,
         )  # using these variables as portion to be split next
         test_size = np.sum(split_array[1:])
         for i in range(len(split_array)):
@@ -686,3 +696,97 @@ def prepare_target_and_attack_data(
             y_attack_test,
             y_membership_test,
         )
+
+    def prepare_attack_data_for_pretrained_model(
+        self,
+        data_split_seed: int,
+        dataset_split_ratios: Dict[DataSplit, float],
+        df_x_in: pd.DataFrame,
+        df_y_in: pd.Series,
+        df_x_out: pd.DataFrame,
+        df_y_out: pd.Series,
+    ) -> None:
+        """
+        Prepares attack splits for a pretrained model using external data sources.
+
+        Parameters
+        ----------
+        data_split_seed : int
+            Random seed for reproducibility.
+        dataset_split_ratios : Dict[DataSplit, float]
+            Ratios for splitting the in/out data into attack train and test sets.
+        df_x_in : pd.DataFrame
+            Features from the model's training data.
+        df_y_in : pd.Series
+            Labels from the model's training data.
+        df_x_out : pd.DataFrame
+            Features from non-training data.
+        df_y_out : pd.Series
+            Labels from non-training data.
+
+        Returns
+        -------
+        None
+
+        """
+        assert abs(dataset_split_ratios[DataSplit.ATTACK_TRAIN_IN] +
+                   dataset_split_ratios[DataSplit.ATTACK_TEST_IN] - 1.0) < 1e-6, \
+            "In-data ratios must sum to 1.0"
+
+        assert abs(dataset_split_ratios[DataSplit.ATTACK_TRAIN_OUT] +
+                   dataset_split_ratios[DataSplit.ATTACK_TEST_OUT] - 1.0) < 1e-6, \
+            "Out-data ratios must sum to 1.0"
+
+        # Split in-data (the model's training data)
+        in_dataset = ClassificationDataset(name="in_data")
+        in_dataset.load_data_from_df(df_x_in, df_y_in)
+        in_dataset.split_dataset(
+            seed=data_split_seed,
+            split_array=[
+                dataset_split_ratios[DataSplit.ATTACK_TRAIN_IN],
+                dataset_split_ratios[DataSplit.ATTACK_TEST_IN],
+            ],
+            split_names=[
+                DataSplit.ATTACK_TRAIN_IN.name,
+                DataSplit.ATTACK_TEST_IN.name,
+            ],
+        )
+
+        # Split out-data (data the model never saw)
+        out_dataset = ClassificationDataset(name="out_data")
+        out_dataset.load_data_from_df(df_x_out, df_y_out)
+        out_dataset.split_dataset(
+            seed=data_split_seed,
+            split_array=[
+                dataset_split_ratios[DataSplit.ATTACK_TRAIN_OUT],
+                dataset_split_ratios[DataSplit.ATTACK_TEST_OUT],
+            ],
+            split_names=[
+                DataSplit.ATTACK_TRAIN_OUT.name,
+                DataSplit.ATTACK_TEST_OUT.name,
+            ],
+        )
+
+        # Merge the splits into the main dataset
+        self.splits.update(in_dataset.splits)
+        self.splits.update(out_dataset.splits)
+
+        # Build the attack train/test sets together with their membership labels
+        X_attack_train, y_attack_train, y_membership_train = self.create_attack_set_from_splits(
+            DataSplit.ATTACK_TRAIN_IN.name,
+            DataSplit.ATTACK_TRAIN_OUT.name,
+        )
+
+        X_attack_test, y_attack_test, y_membership_test = self.create_attack_set_from_splits(
+            DataSplit.ATTACK_TEST_IN.name,
+            DataSplit.ATTACK_TEST_OUT.name,
+        )
+
+        self.attack_model_data = AttackModelData(
+            X_attack_train=X_attack_train,
+            y_attack_train=y_attack_train,
+            y_membership_train=y_membership_train,
+            X_attack_test=X_attack_test,
+            y_attack_test=y_attack_test,
+            y_membership_test=y_membership_test,
+        )
diff --git a/tests/unitary/test_privacy_attacks.py b/tests/unitary/test_privacy_attacks.py
index 2320afc..68c93af 100644
--- a/tests/unitary/test_privacy_attacks.py
+++ b/tests/unitary/test_privacy_attacks.py
@@ -147,6 +147,43 @@ def test_prepare_target_and_attack_data(dataset, dataset_split_ratios):
     assert attack_model_data.X_attack_test.get_shape() == (199, 30)
 
 
+def test_prepare_attack_splits_for_pretrained(dataset, dataset_split_ratios):
+    # Split the dataset fixture's data into in (training) and out (non-training) samples
+    df_x_in = dataset.df_x.iloc[:250]
+    df_y_in = dataset.df_y.iloc[:250]
+    df_x_out = dataset.df_x.iloc[250:]
+    df_y_out = dataset.df_y.iloc[250:]
+
+    # Define dataset split ratios
+    dataset_split_ratios_for_pretrained_model = {
+        DataSplit.ATTACK_TRAIN_IN: 0.6,  # 150 rows
+        DataSplit.ATTACK_TEST_IN: 0.4,  # 100 rows
+        DataSplit.ATTACK_TRAIN_OUT: 0.5,  # 125 rows
+        DataSplit.ATTACK_TEST_OUT: 0.5,  # 125 rows
+    }
+
+    # Prepare attack splits for a pretrained model
+    dataset.prepare_attack_data_for_pretrained_model(
+        data_split_seed=42,
+        dataset_split_ratios=dataset_split_ratios_for_pretrained_model,
+        df_x_in=df_x_in,
+        df_y_in=df_y_in,
+        df_x_out=df_x_out,
+        df_y_out=df_y_out,
+    )
+
+    # Check that the required attack splits exist
+    assert DataSplit.ATTACK_TRAIN_IN.name in dataset.splits
+    assert DataSplit.ATTACK_TEST_IN.name in dataset.splits
+    assert DataSplit.ATTACK_TRAIN_OUT.name in dataset.splits
+    assert DataSplit.ATTACK_TEST_OUT.name in dataset.splits
+
+    # Verify attack model data is initialized and has the expected sizes
+    attack_model_data = dataset.attack_model_data
+    assert attack_model_data is not None
+    assert attack_model_data.X_attack_train.shape[0] == 275  # 150 (in) + 125 (out)
+    assert attack_model_data.X_attack_test.shape[0] == 225  # 100 (in) + 125 (out)
+
+
 @pytest.mark.skip(reason="random state was not added while creating unit testing")
 def test_run_attack(attack_runner, metric_functions):
     cache_input = (