scverse · Lilly-May · Dec 8, 2023 · Dec 6, 2023 · Dec 6, 2023 · Dec 7, 2023
diff --git a/pertpy/tools/_perturbation_space/_simple.py b/pertpy/tools/_perturbation_space/_simple.py
@@ -18,6 +18,7 @@ def compute(
         target_col: str = "perturbations",
         layer_key: str = None,
         embedding_key: str = "X_umap",
+        keep_obs: str | list[str] = None,
     ) -> AnnData:  # type: ignore
         """Computes the centroids of a pre-computed embedding such as UMAP.
 
@@ -26,6 +27,8 @@ def compute(
             target_col: .obs column that stores the label of the perturbation applied to each cell.
             layer_key: If specified pseudobulk computation is done by using the specified layer. Otherwise, computation is done with .X
             embedding_key: `obsm` key of the AnnData embedding to use for computation. Defaults to the 'X' matrix otherwise.
+            keep_obs: .obs columns in the input AnnData to keep in the output pseudobulk AnnData. Only .obs columns with the same value for
+                each cell of one perturbation can be kept. Defaults to None.
 
         Examples:
             Compute the centroids of a UMAP embedding of the papalexi_2021 dataset:
@@ -59,6 +62,13 @@ def compute(
         if target_col not in adata.obs:
             raise ValueError(f"Obs {target_col!r} does not exist in the .obs attribute.")
 
+        if keep_obs is not None:
+            if isinstance(keep_obs, str):
+                keep_obs = [keep_obs]
+            for obs in keep_obs:
+                if obs not in adata.obs:
+                    raise ValueError(f"Obs {obs!r} does not exist in the .obs attribute.")
+
         grouped = adata.obs.groupby(target_col)
 
         if X is None:
@@ -84,6 +94,21 @@ def compute(
 
         ps_adata = AnnData(X=X)
         ps_adata.obs_names = index
+        ps_adata.obs[target_col] = index
+
+        if embedding_key is not None:
+            ps_adata.obsm[embedding_key] = X
+
+        if keep_obs is not None:  # Save the values of the obs columns of interest in the ps_adata object
+            for obs_name in keep_obs:
+                for pert in index:
+                    obs_value = adata[adata.obs.perturbation_name == pert].obs[obs_name].unique()
+                    if len(obs_value) > 1:
+                        raise ValueError(
+                            f"Obs {obs_name!r} has more than one value for perturbation {pert!r}. "
+                            f"You can only keep obs columns with an unambiguous value for each perturbation."
+                        )
+                    ps_adata.obs.loc[ps_adata.obs.index == pert, obs_name] = obs_value[0]
 
         return ps_adata