diff --git a/.github/workflows/conda-publish-manual.yml b/.github/workflows/conda-publish-manual.yml index d11c0f27..b6322564 100644 --- a/.github/workflows/conda-publish-manual.yml +++ b/.github/workflows/conda-publish-manual.yml @@ -30,8 +30,7 @@ jobs: build-args: --target-platform ${{ matrix.target-platform }}${{ matrix.target-platform == 'linux-aarch64' && ' --no-test' || '' }} - name: Show built packages run: | - ls -lh rattler-build/**/*.conda || true + ls -lh output/**/*.conda || true - name: Upload package to Anaconda run: | - export ANACONDA_API_KEY=${{ secrets.ANACONDA_TOKEN }} - rattler-build upload anaconda -o netzoo rattler-build/**/*.conda \ No newline at end of file + rattler-build upload anaconda -o netzoo --api-key ${{ secrets.ANACONDA_TOKEN }} output/**/*.conda diff --git a/.gitignore b/.gitignore index d837f149..39e6f10c 100644 --- a/.gitignore +++ b/.gitignore @@ -58,4 +58,8 @@ debug_bonobo.py !recipe/ !recipe/recipe.yaml -!recipe/test.py \ No newline at end of file +!recipe/test.py + +pixi/* +.pixi/* +pixi* diff --git a/netZooPy/giraffe/__init__.py b/netZooPy/giraffe/__init__.py new file mode 100644 index 00000000..6193abc5 --- /dev/null +++ b/netZooPy/giraffe/__init__.py @@ -0,0 +1,4 @@ +from __future__ import absolute_import + +from .giraffe import Giraffe +from .utils import * diff --git a/netZooPy/giraffe/giraffe.py b/netZooPy/giraffe/giraffe.py new file mode 100644 index 00000000..690f9043 --- /dev/null +++ b/netZooPy/giraffe/giraffe.py @@ -0,0 +1,319 @@ +import random +#from sklearn.linear_model import LinearRegression +import torch +from torch import nn +from torch.functional import F +from typing import Callable +from netZooPy.cobra import * +from netZooPy.giraffe.utils import * +import logging +import pandas as pd + +from torch.optim import Adam +from torch.optim.optimizer import required + + +class Giraffe(object): + """ + GIRAFFE infers regulation and transcription factor activity using + + gene expression + TF DNA binding motif + TF interactions (PPI) + + while (optionally) controlling for covariates of interest (e.g. age). + There are G genes, TF transcription factors, and n samples (cells, patients, ...) + + Parameters + ----------- + expression : numpy array or pandas dataframe + Dimension G x n + prior : numpy array or pandas dataframe + TF-gene regulatory network based on TF motifs as a + matrix of size (tf,g), g=number of genes, tf=number of TFs + ppi : numpy array or pandas dataframe + TF-TF protein interaction network as a matrix of size (tf, tf) + Must be symmetrical and have ones on the diagonal + design_matrix : np.ndarray, pd.DataFrame + COBRA design matrix of size (n, q), n = number of samples, q = number of covariates + cobra_covariate_to_keep : int + Zero-indedex base of COBRA co-expression component to use + iterations : int + number of iterations of the algorithm + lr : float + the learning rate + lam: + + Public Functions + --------- + get_regulation : numpy array + Returns the predicted TF-gene complete regulatory network as an adjacency matrix of size (tf,g) + get_tfa : numpy array + Returns the predicted transcription factor activity as an adjacency matrix of size (tf,n) + + References + ------------- + Author: Soel Micheletti + """ + + def __init__( + self, + expression, + prior, + ppi, + adjusting=None, + design_matrix=None, + cobra_covariate_to_keep=0, + l1=0, + max_iter=2000, + min_iter=200, + lr=1e-5, + lam=None, + balance_fn = None, + save_computation = False, + seed=42, # For reproducibility + ) -> None: + + torch.manual_seed(seed) + random.seed(seed) + np.random.seed(seed) + self._save_computation = save_computation + self.process_data( + expression, + prior, + ppi, + adjusting, + design_matrix, + cobra_covariate_to_keep + ) + self._max_iter = max_iter + self._min_iter = min_iter + self._lr = lr + self._l = l1 + self._lam = lam + self._balance_fn = balance_fn + if self._lam is not None and self._balance_fn is not None: + raise ValueError("Can't set both custom weights and custom weight learning function! Please provide only one of both options. ") + self._R, self._TFA = self._compute_giraffe() + + def process_data( + self, + expression, + motif, + ppi, + adjusting, + design_matrix, + cobra_covariate_to_keep + ): + """ + Processes data files into normzalized data matrices. + + :param expression: numpy matrix or pandas dataframe containing gene expression data. + :param motif: numpy matrix or pandas dataframe containing motif data. + :param ppi: numpy or pandas dataframe containing PPI data. Must be symmetrical. + :param adjusting: vector of covariates that needs to be adjusted. + """ + self._expression = expression + self._motif = motif + self._ppi = ppi + + if isinstance(expression, pd.DataFrame): + self._expression = expression.to_numpy() + if isinstance(motif, pd.DataFrame): + self._motif = motif.to_numpy() + if isinstance(ppi, pd.DataFrame): + self._ppi = ppi.to_numpy() + + check_symmetric(self._ppi) # Check that ppi has legal structure. + + if not isinstance(self._expression, np.ndarray): + raise ValueError( + "Error in processing expression data. Please provide a numpy array or a pandas dataframe. " + ) + if not isinstance(self._motif, np.ndarray): + raise ValueError( + "Error in processing motif data. Please provide a numpy array or a pandas dataframe. " + ) + if not isinstance(self._ppi, np.ndarray): + raise ValueError( + "Error in processing PPI data. Please provide a numpy array or a pandas dataframe. " + ) + + if adjusting is None: + self._adjusting = None + else: + if len(adjusting.shape) == 1: + adjusting = adjusting.reshape(len(adjusting), 1) + self._adjusting = torch.Tensor(np.zeros(adjusting.shape)) + if isinstance(adjusting, pd.DataFrame): + adjusting = adjusting.to_numpy() + if adjusting is not None and not isinstance(adjusting, np.ndarray): + raise ValueError( + "Error in processing adjusting covariates. Please provide a numpy array or a pandas dataframe. " + ) + if adjusting is not None: + for i in range(adjusting.shape[1]): + tmp = torch.Tensor(adjusting[:, i]) + tmp /= torch.norm(tmp) + self._adjusting[:, i] = tmp + + # Normalize motif and PPI + self._motif = motif / np.sqrt(np.trace(motif.T @ motif)) + self._ppi = self._ppi / np.trace(self._ppi) + + self._C = 0 + if not self._save_computation: + # Compute co-expression matrix + if (np.cov(self._expression).diagonal() == 0).any(): + self._expression += 1e-8 + + if design_matrix is not None: + psi, Q, d, g = cobra(design_matrix, self._expression) + if cobra_covariate_to_keep < 0 or cobra_covariate_to_keep >= psi.shape[0]: + raise AttributeError( + "Invalid COBRA component! Valid COBRA components are in range " + str(0) + " - " + str(psi.shape[0] - 1) + ) + self._C = Q.dot(np.diag(psi[cobra_covariate_to_keep,])).dot(Q.T) + else: + self._C = np.corrcoef(self._expression) + self._C /= np.trace(self._C) + + if design_matrix is not None: + self._expression = limma(exprs=self._expression, pheno=pd.DataFrame(design_matrix), covariate_formula = "~ -1") + + def _compute_giraffe(self): + """ + Giraffe optimization. + :return: R : matrix g x tf, partial effects between tanscription factor and gene. + TFA : matrix tf x n, transcrption factor activity. + """ + + variables_to_adjust = 0 + if self._adjusting is not None: + variables_to_adjust = self._adjusting.shape[1] + + giraffe_model = Model(np.random.random((self._motif.shape[1], self._expression.shape[1])), self._motif, variables_to_adjust) + optim = torch.optim.Adam(giraffe_model.parameters(), lr=self._lr) # We run Adam to optimize f(R, TFA) + if self._l > 0: + optim = ProxAdam(giraffe_model.parameters(), lr=self._lr, lambda_ = self._l) + + last_loss = float('inf') + converged = False + for i in range(self._max_iter): + pred = giraffe_model( + torch.Tensor(self._expression), + torch.Tensor(self._ppi), + torch.Tensor(self._C), + self._lam, + self._balance_fn, + self._adjusting, + self._save_computation + ) # Compute f(R, TFA) + loss = F.mse_loss(pred, torch.norm( + torch.Tensor(np.zeros((3, 3))))).sqrt() # Minimization problem: loss = ||f(R, TFA)|| + optim.zero_grad() # Reset gradients + loss.backward() + optim.step() # Adam step + if i >= self._min_iter and abs(last_loss - loss.detach().numpy()) / last_loss < 1e-3: + converged=True + break + last_loss = loss.detach().numpy() + + if not converged: + logging.warn('GIRAFFE did not converge and was stopped after ' + str(self._max_iter) + ' iterations. Try increasing the max_iter parameter.') + R = giraffe_model.R.detach().numpy() + TFA = torch.abs(giraffe_model.TFA.detach()).numpy() + return R, TFA + + def get_regulation(self): + return self._R + + def get_tfa(self): + return self._TFA + # TFA_giraffe = np.zeros(self._TFA.shape) + # for i in range(self._TFA.shape[1]): + # TFA_giraffe[:, i] = LinearRegression(fit_intercept=False, positive=True).fit(self._R, self._expression[:, i]).coef_ + # return TFA_giraffe + + +class Model(nn.Module): + def __init__( + self, + tfa_prior, + motif, + variables_to_adjust + ): + super().__init__() + self.TFA = nn.Parameter(torch.Tensor(tfa_prior)) + self.R = nn.Parameter(torch.Tensor(motif)) + self.variables_to_adjust = variables_to_adjust + self.coefs = nn.Parameter(torch.Tensor(torch.ones((motif.shape[0], self.variables_to_adjust)))) + + def forward( + self, + Y, + PPI, + C, + lam, + balance_fn, + adjusting, + save_computation + ): + if adjusting is None: + L1 = torch.norm(Y - torch.matmul(self.R, torch.abs(self.TFA))) ** 2 + L2 = torch.norm(torch.matmul(torch.t(self.R), self.R) - PPI) ** 2 + L3 = torch.norm(torch.Tensor(torch.zeros((2, 2)))) + if not save_computation: + L3 = torch.norm(torch.matmul(self.R, torch.t(self.R)) - C) ** 2 + L4 = torch.norm(torch.matmul(torch.abs(self.TFA), torch.t(torch.abs(self.TFA))) - PPI) ** 2 + L5 = torch.norm(self.R) ** 2 + weights = self._get_weights(lam, balance_fn, L1, L2, L3, L4, L5) + return weights[0] * L1 + weights[1] * L2 + weights[2] * L3 + weights[3] * L4 + weights[4] * L5 + else : + L1 = torch.norm(Y - torch.matmul(torch.hstack([self.R, self.coefs]), torch.vstack([torch.abs(self.TFA), torch.t(adjusting)]))) ** 2 + L2 = torch.norm(torch.matmul(torch.t(self.R), self.R) - PPI) ** 2 + L3 = torch.norm(torch.Tensor(torch.zeros((2, 2)))) + if not save_computation: + L3 = torch.norm(torch.matmul(self.R, torch.t(self.R)) - C) ** 2 + L4 = torch.norm(torch.matmul(torch.vstack([torch.abs(self.TFA), torch.t(adjusting)]), torch.t(torch.vstack([torch.abs(self.TFA), torch.t(adjusting)]))) - torch.hstack([torch.vstack([PPI, torch.Tensor(np.ones((self.variables_to_adjust, self.R.shape[1])))]), torch.Tensor(torch.ones((self.R.shape[1] + self.variables_to_adjust, self.variables_to_adjust)))])) ** 2 + L5 = torch.norm(torch.hstack([self.R, self.coefs])) ** 2 + weights = self._get_weights(lam, balance_fn, L1, L2, L3, 0, torch.Tensor([0])) + return weights[0] * L1 + 3 * weights[1] * L2 + weights[2] * L3 + weights[3] * L4 + weights[4] * L5 + + def _get_weights(self, lam, balance_fn, L1, L2, L3, L4, L5): + weights = [1, 1, 1, 1, 1] + if lam is not None: + weights = lam + elif balance_fn is not None: + weights = balance_fn(L1, L2, L3, L5) + else: + sum = L1.item() + L2.item() + L3.item() + L5.item() + weights = [1 - L1.item() / sum, 1 - L2.item() / sum, 1 - L3.item() / sum, 1, 1] + return weights + +class ProxAdam(Adam ): + def __init__(self, params, lr=required, lambda_=0): + + kwargs = dict(lr=lr) + super().__init__(params, **kwargs) + self._lambda = lambda_ + proxs = [self.soft_thresholding] + if len(proxs) != len(self.param_groups): + raise ValueError("Invalid length of argument proxs: {} instead of {}".format(len(proxs), len(self.param_groups))) + + for group, prox in zip(self.param_groups, list(proxs)): + group.setdefault('prox', prox) + + def step(self): + # perform a gradient step + super().step() + + for group in self.param_groups: + prox = group['prox'] + + # apply the proximal operator to each parameter in a group + for p in group['params']: + p.data = prox(p.data) + + def soft_thresholding(self, x): + return torch.sign(x) * torch.nn.functional.relu(torch.abs(x) - self._lambda) diff --git a/netZooPy/giraffe/utils.py b/netZooPy/giraffe/utils.py new file mode 100644 index 00000000..d648cf29 --- /dev/null +++ b/netZooPy/giraffe/utils.py @@ -0,0 +1,35 @@ +import numpy as np +import patsy + + +def check_symmetric(ppi): + """ + Raises an exception if ppi does not have the expected structure. + + :param ppi: matrix to be checked + """ + if ppi.shape[0] != ppi.shape[1]: + raise Exception( + "PPI must be a squared matrix. " + ) + if np.diag(ppi).any() != 1: + raise Exception( + "PPI matrix must have ones on the diagonal. " + ) + if np.any(np.abs(ppi - ppi.T) > 1e-8): + raise Exception( + "PPI matrix must be symmetrical. " + ) + +def limma(pheno, exprs, covariate_formula, design_formula='1', rcond=1e-8): + design_matrix = patsy.dmatrix(design_formula, pheno) + + design_matrix = design_matrix[:,1:] + rowsum = design_matrix.sum(axis=1) -1 + design_matrix=(design_matrix.T+rowsum).T + + covariate_matrix = patsy.dmatrix(covariate_formula, pheno) + design_batch = np.hstack((covariate_matrix,design_matrix)) + coefficients, res, rank, s = np.linalg.lstsq(design_batch, exprs.T, rcond=rcond) + beta = coefficients[-design_matrix.shape[1]:] + return exprs - design_matrix.dot(beta).T diff --git a/netZooPy/panda/panda.py b/netZooPy/panda/panda.py index ed959ea7..4f17a161 100755 --- a/netZooPy/panda/panda.py +++ b/netZooPy/panda/panda.py @@ -6,6 +6,7 @@ import numpy as np from netZooPy.cobra import cobra from netZooPy.panda import calculations as calc +from netZooPy.panda import io as io import os class Panda(object): @@ -124,7 +125,8 @@ def __init__( with_header=False, cobra_design_matrix = None, cobra_covariate_to_keep = 0, - tmp_folder = './tmp/' + tmp_folder = './tmp/', + process_data_only = False ): """ Intialize instance of Panda class and load data. """ @@ -146,72 +148,73 @@ def __init__( print(modeProcess,motif_file,expression_file,ppi_file,save_memory,remove_missing,keep_expression_matrix) if hasattr(self, "export_panda_results"): return - - # ===================================================================== - # Network normalization - # ===================================================================== - - self.precision = precision - with Timer("Normalizing networks ..."): - self.correlation_matrix = self._normalize_network(self.correlation_matrix) - with np.errstate(invalid="ignore"): # silly warning bothering people - self.motif_matrix = self._normalize_network( - self.motif_matrix_unnormalized + if (process_data_only==False): + # ===================================================================== + # Network normalization + # ===================================================================== + + self.precision = precision + + with Timer("Normalizing networks ..."): + self.correlation_matrix = self._normalize_network(self.correlation_matrix) + with np.errstate(invalid="ignore"): # silly warning bothering people + self.motif_matrix = self._normalize_network( + self.motif_matrix_unnormalized + ) + self.ppi_matrix = self._normalize_network(self.ppi_matrix) + if self.precision == "single": + self.correlation_matrix = np.float32(self.correlation_matrix) + self.motif_matrix = np.float32(self.motif_matrix) + self.ppi_matrix = np.float32(self.ppi_matrix) + # ===================================================================== + # Clean up useless variables to release memory + # ===================================================================== + self.tfs, self.genes = self.unique_tfs, self.gene_names + + if (save_memory==True): + print("Clearing motif and ppi data, unique tfs, and gene names for speed") + del self.unique_tfs, self.gene_names, self.motif_matrix_unnormalized + print("WARNING: save_memory will be uncoupled from the output behavior.\n Pass as_adjacency to save the output panda as adjacency matrix") + # ===================================================================== + # Saving middle data to tmp + # ===================================================================== + self.tmp_folder = tmp_folder + if save_tmp: + with Timer("Saving expression matrix and normalized networks in %s..." %str(self.tmp_folder)): + os.makedirs(self.tmp_folder,exist_ok=True) + if self.expression_data is not None: + np.save(self.tmp_folder + "expression.npy", self.expression_data.values) + np.save(self.tmp_folder + "motif.normalized.npy", self.motif_matrix) + np.save(self.tmp_folder + "ppi.normalized.npy", self.ppi_matrix) + + + # delete expression data + del self.expression_data + + # ===================================================================== + # Running PANDA algorithm + # ===================================================================== + if self.motif_data is not None: + print("Running PANDA algorithm ...") + self.panda_network = self.panda_loop( + self.correlation_matrix, + self.motif_matrix, + self.ppi_matrix, + computing, + alpha, + ) + # label dataframe + self.panda_network = pd.DataFrame( + self.panda_network, index=self.tfs, columns=self.genes + ) + else: + self.panda_network = self.correlation_matrix + self.__pearson_results_data_frame() + # label dataframe + self.panda_network = pd.DataFrame( + self.panda_network, index=self.genes, columns=self.genes ) - self.ppi_matrix = self._normalize_network(self.ppi_matrix) - if self.precision == "single": - self.correlation_matrix = np.float32(self.correlation_matrix) - self.motif_matrix = np.float32(self.motif_matrix) - self.ppi_matrix = np.float32(self.ppi_matrix) - # ===================================================================== - # Clean up useless variables to release memory - # ===================================================================== - self.tfs, self.genes = self.unique_tfs, self.gene_names - - if (save_memory==True): - print("Clearing motif and ppi data, unique tfs, and gene names for speed") - del self.unique_tfs, self.gene_names, self.motif_matrix_unnormalized - print("WARNING: save_memory will be uncoupled from the output behavior.\n Pass as_adjacency to save the output panda as adjacency matrix") - # ===================================================================== - # Saving middle data to tmp - # ===================================================================== - self.tmp_folder = tmp_folder - if save_tmp: - with Timer("Saving expression matrix and normalized networks in %s..." %str(self.tmp_folder)): - os.makedirs(self.tmp_folder,exist_ok=True) - if self.expression_data is not None: - np.save(self.tmp_folder + "expression.npy", self.expression_data.values) - np.save(self.tmp_folder + "motif.normalized.npy", self.motif_matrix) - np.save(self.tmp_folder + "ppi.normalized.npy", self.ppi_matrix) - - - # delete expression data - del self.expression_data - - # ===================================================================== - # Running PANDA algorithm - # ===================================================================== - if self.motif_data is not None: - print("Running PANDA algorithm ...") - self.panda_network = self.panda_loop( - self.correlation_matrix, - self.motif_matrix, - self.ppi_matrix, - computing, - alpha, - ) - # label dataframe - self.panda_network = pd.DataFrame( - self.panda_network, index=self.tfs, columns=self.genes - ) - else: - self.panda_network = self.correlation_matrix - self.__pearson_results_data_frame() - # label dataframe - self.panda_network = pd.DataFrame( - self.panda_network, index=self.genes, columns=self.genes - ) def __remove_missing(self): """ Removes the genes and TFs that are not present in one of the priors. Works only if modeProcess='legacy'. @@ -326,82 +329,26 @@ def processData( # Data loading # ===================================================================== ### Loading Motif - if type(motif_file) is str: - # If motif_file is a filename - with Timer("Loading motif data ..."): - self.motif_data = pd.read_csv(motif_file, sep="\t", header=None) - self.motif_tfs = sorted(set(self.motif_data[0])) - self.motif_genes = sorted(set(self.motif_data[1])) - # self.num_tfs = len(self.unique_tfs) - # print('Unique TFs:', self.num_tfs) - elif type(motif_file) is not str: - # If motif_file is an object - if motif_file is None: - # Computation without motif - self.motif_data = None - self.motif_genes = [] - self.motif_tfs = [] - else: - # If motif_file is an object, it needs to be a dataframe - if not isinstance(motif_file, pd.DataFrame): - raise Exception( - "Please provide a pandas dataframe for motif data with column names as 'source', 'target', and 'weight'." - ) - if ("source" not in motif_file.columns) or ( - "target" not in motif_file.columns - ): - print('renaming motif columns to "source", "target" and "weight" ') - motif_file.columns = ["source", "target", "weight"] - self.motif_data = pd.DataFrame(motif_file.values) - self.motif_tfs = sorted(set(motif_file["source"])) - self.motif_genes = sorted(set(motif_file["target"])) - # self.num_tfs = len(self.unique_tfs) - # print('Unique TFs:', self.num_tfs) - + with Timer("Loading motif data ..."): + self.motif_data, self.motif_tfs, self.motif_genes = io.load_motif(motif_file) + ### Loading expression - if type(expression_file) is str: - # If we pass an expression file, check if we have a 'with header' flag and read it - with Timer("Loading expression data ..."): - if with_header: - # Read data with header - self.expression_data = pd.read_csv( - expression_file, sep="\t", index_col=0 - ) - else: - self.expression_data = pd.read_csv( - expression_file, sep="\t", header=None, index_col=0 - ) - # assign expression data and samples/gene names - self.expression_data = self.expression_data.iloc[:, (start-1):end] - self.expression_genes = self.expression_data.index.tolist() - self.expression_samples = self.expression_data.columns.astype(str) - # self.num_genes = len(self.gene_names) - # print('Expression matrix:', self.expression_data.shape) - elif type(expression_file) is not str: - # Pass expression as a dataframe - if expression_file is not None: - if not isinstance(expression_file, pd.DataFrame): - raise Exception( - "Please provide a pandas dataframe for expression data." - ) - self.expression_data = expression_file.iloc[:, (start-1):end] # pd.read_csv(expression_file, sep='\t', header=None, index_col=0) - self.expression_genes = self.expression_data.index.tolist() - self.expression_samples = self.expression_data.columns.astype(str) - # self.num_genes = len(self.gene_names) - # print('Expression matrix:', self.expression_data.shape) - else: - # If no expression is passed - self.gene_names = self.motif_genes - self.expression_genes = self.motif_genes - self.num_genes = len(self.gene_names) - self.expression_data = ( - None # pd.DataFrame(np.identity(self.num_genes, dtype=int)) - ) - print( - # TODO: Marouen check here. Here we do not pass the identity matrix - "No Expression data given: correlation matrix will be an identity matrix of size", - len(self.motif_genes), - ) + with Timer("Loading expression data ..."): + self.expression_data, self.expression_genes, self.expression_samples = io.load_expression(expression_file, with_header = with_header, start = start, end = end) + + if ((self.expression_data is None) & (self.expression_genes is None) & (self.expression_samples is None)): + # If no expression is passed + self.gene_names = self.motif_genes + self.expression_genes = self.motif_genes + self.num_genes = len(self.gene_names) + self.expression_data = ( + None # pd.DataFrame(np.identity(self.num_genes, dtype=int)) + ) + print( + # TODO: Marouen check here. Here we do not pass the identity matrix + "No Expression data given: correlation matrix will be an identity matrix of size", + len(self.motif_genes), + ) if len(self.expression_genes) != len(np.unique(self.expression_genes)): print( @@ -409,6 +356,7 @@ def processData( ) ### Loading the PPI + # #TODO: move this to io if type(ppi_file) is str: with Timer("Loading PPI data ..."): self.ppi_data = pd.read_csv(ppi_file, sep="\t", header=None)