Refactoring and move plot

Signed-off-by: zethson <[email protected]>
scverse · Dec 20, 2023 · 7c5c81a · 7c5c81a
1 parent c36dacf
commit 7c5c81a
Show file tree

Hide file tree

Showing 8 changed files with 272 additions and 153 deletions.
diff --git a/nicheformer-data/pyproject.toml b/nicheformer-data/pyproject.toml
@@ -0,0 +1,132 @@
+[build-system]
+build-backend = "hatchling.build"
+requires = ["hatchling"]
+
+[project]
+name = "nicheformer-data"
+version = "0.0.1"
+description = "Data collection for nicheformer"
+readme = "README.md"
+requires-python = ">=3.9"
+license = {file = "LICENSE"}
+authors = [
+    {name = "theislab"},
+]
+maintainers = [
+    {name = "theislab", email = "[email protected]"},
+]
+urls.Documentation = "https://nicheformer-data.readthedocs.io/"
+urls.Source = "https://github.com/theislab/nicheformer-data"
+urls.Home-page = "https://github.com/theislab/nicheformer-data"
+dependencies = [
+    "anndata",
+    "scanpy",
+    "lamindb[zarr,aws,bionty,jupyter]==0.63.5",
+    "cellxgene-schema>=3.1.3",
+    # for debug logging (referenced from the issue template)
+    "session-info"
+]
+
+[project.optional-dependencies]
+dev = [
+    "pre-commit",
+    "twine>=4.0.2"
+]
+doc = [
+    "docutils>=0.8,!=0.18.*,!=0.19.*",
+    "sphinx>=4",
+    "sphinx-book-theme>=1.0.0",
+    "myst-nb",
+    "sphinxcontrib-bibtex>=1.0.0",
+    "sphinx-autodoc-typehints",
+    "sphinxext-opengraph",
+    # For notebooks
+    "ipykernel",
+    "ipython",
+    "sphinx-copybutton",
+]
+test = [
+    "pytest",
+    "pytest-cov",
+]
+
+[tool.coverage.run]
+source = ["nicheformer_data"]
+omit = [
+    "**/test_*.py",
+]
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+xfail_strict = true
+addopts = [
+    "--import-mode=importlib",  # allow using test files with same name
+]
+
+[tool.black]
+line-length = 120
+
+[tool.ruff]
+src = ["src"]
+line-length = 120
+select = [
+    "F",  # Errors detected by Pyflakes
+    "E",  # Error detected by Pycodestyle
+    "W",  # Warning detected by Pycodestyle
+    "I",  # isort
+    "D",  # pydocstyle
+    "B",  # flake8-bugbear
+    "TID",  # flake8-tidy-imports
+    "C4",  # flake8-comprehensions
+    "BLE",  # flake8-blind-except
+    "UP",  # pyupgrade
+    "RUF100",  # Report unused noqa directives
+]
+ignore = [
+    # line too long -> we accept long comment lines; black gets rid of long code lines
+    "E501",
+    # Do not assign a lambda expression, use a def -> lambda expression assignments are convenient
+    "E731",
+    # allow I, O, l as variable names -> I is the identity matrix
+    "E741",
+    # Missing docstring in public package
+    "D104",
+    # Missing docstring in public module
+    "D100",
+    # Missing docstring in __init__
+    "D107",
+    # Errors from function calls in argument defaults. These are fine when the result is immutable.
+    "B008",
+    # __magic__ methods are often self-explanatory, allow missing docstrings
+    "D105",
+    # first line should end with a period [Bug: doesn't work with single-line docstrings]
+    "D400",
+    # First line should be in imperative mood; try rephrasing
+    "D401",
+    ## Disable one in each pair of mutually incompatible rules
+    # We don’t want a blank line before a class docstring
+    "D203",
+    # We want docstrings to start immediately after the opening triple quote
+    "D213",
+]
+
+[tool.ruff.pydocstyle]
+convention = "numpy"
+
+[tool.ruff.per-file-ignores]
+"docs/*" = ["I"]
+"tests/*" = ["D"]
+"*/__init__.py" = ["F401"]
+"scripts/*.py" = ["D","BLE","I", "E"]
+
+[tool.cruft]
+skip = [
+    "tests",
+    "src/**/__init__.py",
+    "src/**/basic.py",
+    "docs/api.md",
+    "docs/changelog.md",
+    "docs/references.bib",
+    "docs/references.md",
+    "docs/notebooks/example.ipynb"
+]
diff --git a/pertpy/metadata/_cell_line.py b/pertpy/metadata/_cell_line.py
@@ -632,20 +632,19 @@ def correlate(
         Returns:
             Returns pearson correlation coefficients and their corresponding p-values for matched and unmatched cell lines separately.
         """
-
         if metadata_key not in adata.obsm:
             raise ValueError("The metadata can not be found in adata.obsm")
         if identifier not in adata.obs:
             raise ValueError("The identifier can not be found in adata.obs")
         if adata.X.shape[1] != adata.obsm[metadata_key].shape[1]:
             raise ValueError(
-                "The dimensions of adata.X do not match those of metadata, please make sure that they have the same gene list."
+                "Dimensions of adata.X do not match those of metadata. Ensure that they have the same gene list."
             )
         if isinstance(adata.obsm[metadata_key], pd.DataFrame):
             # Give warning if the genes are not the same
             if sum(adata.obsm[metadata_key].columns != adata.var.index.values) > 0:
                 print(
-                    "The column name of metadata is not the same as the index of adata.var, please make sure the genes are in the same order."
+                    "Column name of metadata is not the same as the index of adata.var. Ensure that the genes are in the same order."
                 )
 
         # Divide cell lines into those are present and not present in the metadata
@@ -669,3 +668,95 @@ def correlate(
             new_corr = new_pvals = None
 
         return corr, pvals, new_corr, new_pvals
+
+    def plot_correlation(
+        self,
+        adata: AnnData,
+        corr: pd.DataFrame,
+        pval: pd.DataFrame,
+        identifier: str = "DepMap_ID",
+        metadata_key: str = "bulk_rna_broad",
+        category: str = "cell line",
+        subset_identifier: str | int | Iterable[str] | Iterable[int] | None = None,
+    ) -> None:
+        """Visualise the correlation of cell lines with annotated metadata.
+
+        Args:
+            adata: Input data object.
+            corr: Pearson correlation scores. If not available, please call the function `pt.md.CellLine.correlate()` first.
+            pval: P-values for pearson correlation. If not available, please call the function `pt.md.CellLine.correlate()` first.
+            identifier: Column in `.obs` containing the identifiers. Defaults to 'DepMap_ID'.
+            metadata_key: Key of the AnnData obsm for comparison with the X matrix. Defaults to 'bulk_rna_broad'.
+            category: The category for correlation comparison. Defaults to "cell line".
+            subset_identifier: Selected identifiers for scatter plot visualization between the X matrix and `metadata_key`.
+                              If not None, only the chosen cell line will be plotted, either specified as a value in `identifier` (string) or as an index number.
+                              If None, all cell lines will be plotted.
+                              Defaults to None.
+        Returns:
+            None. The annotated scatter plot is rendered with `plt.show()`; nothing is returned.
+        """
+        if category == "cell line":
+            if subset_identifier is None:
+                annotation = "\n".join(
+                    (
+                        f"Mean pearson correlation: {np.mean(np.diag(corr)):.4f}",
+                        f"Mean p-value: {np.mean(np.diag(pval)):.4f}",
+                    )
+                )
+                plt.scatter(x=adata.obsm[metadata_key], y=adata.X)
+                plt.xlabel(metadata_key)
+                plt.ylabel("Baseline")
+            else:
+                subset_identifier_list = (
+                    [subset_identifier] if isinstance(subset_identifier, (str, int)) else list(subset_identifier)
+                )
+
+                if all(isinstance(id, int) and 0 <= id < adata.n_obs for id in subset_identifier_list):
+                    # Visualize the chosen cell line at the given index
+                    subset_identifier_list = adata.obs[identifier].values[subset_identifier_list]
+                elif not all(isinstance(id, str) for id in subset_identifier_list) or not set(
+                    subset_identifier_list
+                ).issubset(adata.obs[identifier].unique()):
+                    # The chosen cell line must be found in `identifier`
+                    raise ValueError(
+                        "`Subset_identifier` must contain either all strings or all integers within the index."
+                    )
+
+                plt.scatter(
+                    x=adata.obsm[metadata_key].loc[subset_identifier_list],
+                    y=adata[adata.obs[identifier].isin(subset_identifier_list)].X,
+                )
+                plt.xlabel(
+                    f"{metadata_key}: {subset_identifier_list[0]}"
+                    if len(subset_identifier_list) == 1
+                    else f"{metadata_key}"
+                )
+                plt.ylabel(f"Baseline: {subset_identifier_list[0]}" if len(subset_identifier_list) == 1 else "Baseline")
+
+                # Annotate with the correlation coefficient and p-value of the chosen cell lines
+                subset_cor = np.mean(np.diag(corr.loc[subset_identifier_list, subset_identifier_list]))
+                subset_pval = np.mean(np.diag(pval.loc[subset_identifier_list, subset_identifier_list]))
+                annotation = "\n".join(
+                    (
+                        f"Pearson correlation: {subset_cor:.4f}",
+                        f"P-value: {subset_pval:.4f}",
+                    )
+                )
+
+            plt.text(
+                0.05,
+                0.95,
+                annotation,
+                fontsize=10,
+                transform=plt.gca().transAxes,
+                verticalalignment="top",
+                bbox={
+                    "boxstyle": "round",
+                    "alpha": 0.5,
+                    "facecolor": "white",
+                    "edgecolor": "black",
+                },
+            )
+            plt.show()
+        else:
+            raise NotImplementedError
diff --git a/pertpy/metadata/_compound.py b/pertpy/metadata/_compound.py
@@ -34,13 +34,12 @@ def annotate_compounds(
     ) -> AnnData:
         """Fetch compound annotation from pubchempy.
 
-        For each cell, we fetch compound annotation via pubchempy.
-
         Args:
             adata: The data object to annotate.
-            query_id: The column of `.obs` with compound identifiers. Defaults to "perturbation".
-            query_id_type: The type of compound identifiers, 'name' or 'cid'. Defaults to "name".
-            verbosity: The number of unmatched identifiers to print, can be either non-negative values or "all". Defaults to 5.
+            query_id: The column of `.obs` with compound identifiers. Defaults to 'perturbation'.
+            query_id_type: The type of compound identifiers, 'name' or 'cid'. Defaults to 'name'.
+            verbosity: The number of unmatched identifiers to print, can be either non-negative values or "all".
+                       Defaults to 5.
             copy: Determines whether a copy of the `adata` is returned. Defaults to False.
 
         Returns:
@@ -50,7 +49,7 @@ def annotate_compounds(
             adata = adata.copy()
 
         if query_id not in adata.obs.columns:
-            raise ValueError(f"The requested query_id {query_id} is not in `adata.obs`. \n" "Please check again. ")
+            raise ValueError(f"The requested query_id {query_id} is not in `adata.obs`.\n" "Please check again. ")
 
         query_dict = {}
         not_matched_identifiers = []

diff --git a/pertpy/metadata/_look_up.py b/pertpy/metadata/_look_up.py
@@ -24,10 +24,10 @@ def __init__(
     ):
         """
         Args:
-            type: metadata type for annotation, cell_line, compound or moa. Defaults to cell_line.
-            transfer_metadata: dataframes used to generate Lookup object. This is currently set to None for CompoundMetaData which does not require any dataframes for transfer. Defaults to None. To ensure efficient transfer of
-                metadata during initialization, LookUp object should always be generated by the corresponding MetaData
-                class. Also, different MetaData classes have different required metadata to transfer.
+            type: Metadata type for annotation. One of 'cell_line', 'compound' or 'moa'. Defaults to 'cell_line'.
+            transfer_metadata: DataFrames used to generate Lookup object.
+                               This is currently set to None for CompoundMetaData which does not require any dataframes for transfer.
+                               Defaults to None.
         """
         if type == "cell_line":
             self.type = type
@@ -245,7 +245,6 @@ def available_cell_lines(
                 "stripped_cell_line_name". Defaults to "DepMap_ID".
             query_id_list: Unique cell line identifiers to test the number of matched ids present in the
                 metadata. If set to None, the query of metadata identifiers will be disabled. Defaults to None.
-
         """
         if self.type != "cell_line":
             raise ValueError("This is not a LookUp object specifically for CellLineMetaData!")
@@ -336,12 +335,20 @@ def available_drug_response(
         """A brief summary of drug response data.
 
         Args:
-            gdsc_dataset: The GDSC dataset, 1 or 2. Defaults to 1. The GDSC1 dataset updates previous releases with additional drug screening data from the Wellcome Sanger Institute and Massachusetts General Hospital. It covers 970 Cell lines and 403 Compounds with 333292 IC50s. GDSC2 is new and has 243,466 IC50 results from the latest screening at the Wellcome Sanger Institute using improved experimental procedures.
-            reference_id: The type of cell line identifier in the meta data, cell_line_name, sanger_model_id or cosmic_id. Defaults to "cell_line_name".
-            query_id_list: Unique cell line identifiers to test the number of matched ids present in the metadata. If set to None, the query of metadata identifiers will be disabled. Defaults to None.
-            reference_perturbation: The perturbation information in the meta data, drug_name or drug_id. Defaults to "drug_name".
-            query_perturbation_list: Unique perturbation types to test the number of matched ones present in the metadata. If set to None, the query of perturbation types will be disabled. Defaults to None.
-
+            gdsc_dataset: The GDSC dataset, 1 or 2. Defaults to 1.
+                          The GDSC1 dataset updates previous releases with additional drug screening data from the Wellcome Sanger Institute and Massachusetts General Hospital.
+                          It covers 970 Cell lines and 403 Compounds with 333292 IC50s.
+                          GDSC2 is new and has 243,466 IC50 results from the latest screening at the Wellcome Sanger Institute using improved experimental procedures.
+            reference_id: The type of cell line identifier in the meta data, cell_line_name, sanger_model_id or cosmic_id.
+                          Defaults to 'cell_line_name'.
+            query_id_list: Unique cell line identifiers to test the number of matched ids present in the metadata.
+                           If set to None, the query of metadata identifiers will be disabled.
+                           Defaults to None.
+            reference_perturbation: The perturbation information in the meta data, drug_name or drug_id.
+                                    Defaults to 'drug_name'.
+            query_perturbation_list: Unique perturbation types to test the number of matched ones present in the metadata.
+                                     If set to None, the query of perturbation types will be disabled.
+                                     Defaults to None.
         """
         if self.type != "cell_line":
             raise ValueError("This is not a LookUp object specific for CellLineMetaData!")
@@ -378,8 +385,10 @@ def available_genes_annotation(
         """A brief summary of gene annotation metadata
 
         Args:
-            reference_id: The type of gene identifier in the meta data, gene_id, ensembl_gene_id, hgnc_id, hgnc_symbol. Defaults to "ensembl_gene_id".
-            query_id_list: Unique gene identifiers to test the number of matched ids present in the metadata. Defaults to None.
+            reference_id: The type of gene identifier in the meta data, gene_id, ensembl_gene_id, hgnc_id, hgnc_symbol.
+                          Defaults to "ensembl_gene_id".
+            query_id_list: Unique gene identifiers to test the number of matched ids present in the metadata.
+                           Defaults to None.
         """
         if self.type != "cell_line":
             raise ValueError("This is not a LookUp object specific for CellLineMetaData!")
@@ -415,8 +424,12 @@ def available_moa(
         """A brief summary of MoA annotation.
 
         Args:
-            query_id_list: Unique perturbagens to test the number of matched ones present in the metadata. If set to None, the query of metadata perturbagens will be disabled. Defaults to None.
-            target_list: Unique molecular targets to test the number of matched ones present in the metadata. If set to None, the comparison of molecular targets in the query of metadata perturbagens will be disabled. Defaults to None.
+            query_id_list: Unique perturbagens to test the number of matched ones present in the metadata.
+                           If set to None, the query of metadata perturbagens will be disabled.
+                           Defaults to None.
+            target_list: Unique molecular targets to test the number of matched ones present in the metadata.
+                         If set to None, the comparison of molecular targets in the query of metadata perturbagens will be disabled.
+                         Defaults to None.
         """
         if self.type != "moa":
             raise ValueError("This is not a LookUp object specific for MoaMetaData!")
@@ -443,8 +456,10 @@ def available_compounds(
     """A brief summary of compound annotation.
 
     Args:
-        query_id_list: Unique compounds to test the number of matched ones present in the metadata. If set to None, query of compound identifiers will be disabled. Defaults to None.
-        query_id_type: The type of compound identifiers, name or cid. Defaults to "name".
+        query_id_list: Unique compounds to test the number of matched ones present in the metadata.
+                       If set to None, query of compound identifiers will be disabled.
+                       Defaults to None.
+        query_id_type: The type of compound identifiers, name or cid. Defaults to 'name'.
     """
     if self.type != "compound":
         raise ValueError("This is not a LookUp object specific for CompoundData!")

diff --git a/pertpy/metadata/_metadata.py b/pertpy/metadata/_metadata.py
@@ -7,9 +7,6 @@
 
 
 class MetaData:
-    def __init__(self):
-        pass
-
     def _warn_unmatch(
         self,
         total_identifiers: int,