diff --git a/nicheformer-data/pyproject.toml b/nicheformer-data/pyproject.toml new file mode 100644 index 00000000..620d677e --- /dev/null +++ b/nicheformer-data/pyproject.toml @@ -0,0 +1,132 @@ +[build-system] +build-backend = "hatchling.build" +requires = ["hatchling"] + +[project] +name = "nicheformer-data" +version = "0.0.1" +description = "Data collection for nicheformer" +readme = "README.md" +requires-python = ">=3.9" +license = {file = "LICENSE"} +authors = [ + {name = "theislab"}, +] +maintainers = [ + {name = "theislab", email = "theislab@helmholtz-munich.de"}, +] +urls.Documentation = "https://nicheformer-data.readthedocs.io/" +urls.Source = "https://github.com/theislab/nicheformer-data" +urls.Home-page = "https://github.com/theislab/nicheformer-data" +dependencies = [ + "anndata", + "scanpy", + "lamindb[zarr,aws,bionty,jupyter]==0.63.5", + "cellxgene-schema>=3.1.3", + # for debug logging (referenced from the issue template) + "session-info" +] + +[project.optional-dependencies] +dev = [ + "pre-commit", + "twine>=4.0.2" +] +doc = [ + "docutils>=0.8,!=0.18.*,!=0.19.*", + "sphinx>=4", + "sphinx-book-theme>=1.0.0", + "myst-nb", + "sphinxcontrib-bibtex>=1.0.0", + "sphinx-autodoc-typehints", + "sphinxext-opengraph", + # For notebooks + "ipykernel", + "ipython", + "sphinx-copybutton", +] +test = [ + "pytest", + "pytest-cov", +] + +[tool.coverage.run] +source = ["nicheformer_data"] +omit = [ + "**/test_*.py", +] + +[tool.pytest.ini_options] +testpaths = ["tests"] +xfail_strict = true +addopts = [ + "--import-mode=importlib", # allow using test files with same name +] + +[tool.black] +line-length = 120 + +[tool.ruff] +src = ["src"] +line-length = 120 +select = [ + "F", # Errors detected by Pyflakes + "E", # Error detected by Pycodestyle + "W", # Warning detected by Pycodestyle + "I", # isort + "D", # pydocstyle + "B", # flake8-bugbear + "TID", # flake8-tidy-imports + "C4", # flake8-comprehensions + "BLE", # flake8-blind-except + "UP", # pyupgrade + "RUF100", # 
Report unused noqa directives +] +ignore = [ + # line too long -> we accept long comment lines; black gets rid of long code lines + "E501", + # Do not assign a lambda expression, use a def -> lambda expression assignments are convenient + "E731", + # allow I, O, l as variable names -> I is the identity matrix + "E741", + # Missing docstring in public package + "D104", + # Missing docstring in public module + "D100", + # Missing docstring in __init__ + "D107", + # Errors from function calls in argument defaults. These are fine when the result is immutable. + "B008", + # __magic__ methods are often self-explanatory, allow missing docstrings + "D105", + # first line should end with a period [Bug: doesn't work with single-line docstrings] + "D400", + # First line should be in imperative mood; try rephrasing + "D401", + ## Disable one in each pair of mutually incompatible rules + # We don’t want a blank line before a class docstring + "D203", + # We want docstrings to start immediately after the opening triple quote + "D213", +] + +[tool.ruff.pydocstyle] +convention = "numpy" + +[tool.ruff.per-file-ignores] +"docs/*" = ["I"] +"tests/*" = ["D"] +"*/__init__.py" = ["F401"] +"scripts/*.py" = ["D","BLE","I", "E"] + +[tool.cruft] +skip = [ + "tests", + "src/**/__init__.py", + "src/**/basic.py", + "docs/api.md", + "docs/changelog.md", + "docs/references.bib", + "docs/references.md", + "docs/notebooks/example.ipynb" +] diff --git a/pertpy/metadata/_cell_line.py b/pertpy/metadata/_cell_line.py index cf4f7bd2..84d40565 100644 --- a/pertpy/metadata/_cell_line.py +++ b/pertpy/metadata/_cell_line.py @@ -632,20 +632,19 @@ def correlate( Returns: Returns pearson correlation coefficients and their corresponding p-values for matched and unmatched cell lines separately. 
""" - if metadata_key not in adata.obsm: raise ValueError("The metadata can not be found in adata.obsm") if identifier not in adata.obs: raise ValueError("The identifier can not be found in adata.obs") if adata.X.shape[1] != adata.obsm[metadata_key].shape[1]: raise ValueError( - "The dimensions of adata.X do not match those of metadata, please make sure that they have the same gene list." + "Dimensions of adata.X do not match those of metadata. Ensure that they have the same gene list." ) if isinstance(adata.obsm[metadata_key], pd.DataFrame): # Give warning if the genes are not the same if sum(adata.obsm[metadata_key].columns != adata.var.index.values) > 0: print( - "The column name of metadata is not the same as the index of adata.var, please make sure the genes are in the same order." + "Column name of metadata is not the same as the index of adata.var. Ensure that the genes are in the same order." ) # Divide cell lines into those are present and not present in the metadata @@ -669,3 +668,95 @@ def correlate( new_corr = new_pvals = None return corr, pvals, new_corr, new_pvals + + def plot_correlation( + self, + adata: AnnData, + corr: pd.DataFrame, + pval: pd.DataFrame, + identifier: str = "DepMap_ID", + metadata_key: str = "bulk_rna_broad", + category: str = "cell line", + subset_identifier: str | int | Iterable[str] | Iterable[int] | None = None, + ) -> None: + """Visualise the correlation of cell lines with annotated metadata. + + Args: + adata: Input data object. + corr: Pearson correlation scores. If not available, please call the function `pt.md.CellLine.correlate()` first. + pval: P-values for pearson correlation. If not available, please call the function `pt.md.CellLine.correlate()` first. + identifier: Column in `.obs` containing the identifiers. Defaults to 'DepMap_ID'. + metadata_key: Key of the AnnData obsm for comparison with the X matrix. Defaults to 'bulk_rna_broad'. + category: The category for correlation comparison. Defaults to "cell line". 
+ subset_identifier: Selected identifiers for scatter plot visualization between the X matrix and `metadata_key`. + If not None, only the chosen cell line will be plotted, either specified as a value in `identifier` (string) or as an index number. + If None, all cell lines will be plotted. + Defaults to None. + Returns: + None. The annotated scatter plot is displayed with `plt.show()`. + """ + if category == "cell line": + if subset_identifier is None: + annotation = "\n".join( + ( + f"Mean pearson correlation: {np.mean(np.diag(corr)):.4f}", + f"Mean p-value: {np.mean(np.diag(pval)):.4f}", + ) + ) + plt.scatter(x=adata.obsm[metadata_key], y=adata.X) + plt.xlabel(metadata_key) + plt.ylabel("Baseline") + else: + subset_identifier_list = ( + [subset_identifier] if isinstance(subset_identifier, (str, int)) else list(subset_identifier) + ) + + if all(isinstance(id, int) and 0 <= id < adata.n_obs for id in subset_identifier_list): + # Visualize the chosen cell line at the given index + subset_identifier_list = adata.obs[identifier].values[subset_identifier_list] + elif not all(isinstance(id, str) for id in subset_identifier_list) or not set( + subset_identifier_list + ).issubset(adata.obs[identifier].unique()): + # The chosen cell line must be found in `identifier` + raise ValueError( + "`Subset_identifier` must contain either all strings or all integers within the index." 
+ ) + + plt.scatter( + x=adata.obsm[metadata_key].loc[subset_identifier_list], + y=adata[adata.obs[identifier].isin(subset_identifier_list)].X, + ) + plt.xlabel( + f"{metadata_key}: {subset_identifier_list[0]}" + if len(subset_identifier_list) == 1 + else f"{metadata_key}" + ) + plt.ylabel(f"Baseline: {subset_identifier_list[0]}" if len(subset_identifier_list) == 1 else "Baseline") + + # Annotate with the correlation coefficient and p-value of the chosen cell lines + subset_cor = np.mean(np.diag(corr.loc[subset_identifier_list, subset_identifier_list])) + subset_pval = np.mean(np.diag(pval.loc[subset_identifier_list, subset_identifier_list])) + annotation = "\n".join( + ( + f"Pearson correlation: {subset_cor:.4f}", + f"P-value: {subset_pval:.4f}", + ) + ) + + plt.text( + 0.05, + 0.95, + annotation, + fontsize=10, + transform=plt.gca().transAxes, + verticalalignment="top", + bbox={ + "boxstyle": "round", + "alpha": 0.5, + "facecolor": "white", + "edgecolor": "black", + }, + ) + plt.show() + else: + raise NotImplementedError diff --git a/pertpy/metadata/_compound.py b/pertpy/metadata/_compound.py index 6c9e4232..1442921e 100644 --- a/pertpy/metadata/_compound.py +++ b/pertpy/metadata/_compound.py @@ -34,13 +34,12 @@ def annotate_compounds( ) -> AnnData: """Fetch compound annotation from pubchempy. - For each cell, we fetch compound annotation via pubchempy. - Args: adata: The data object to annotate. - query_id: The column of `.obs` with compound identifiers. Defaults to "perturbation". - query_id_type: The type of compound identifiers, 'name' or 'cid'. Defaults to "name". - verbosity: The number of unmatched identifiers to print, can be either non-negative values or "all". Defaults to 5. + query_id: The column of `.obs` with compound identifiers. Defaults to 'perturbation'. + query_id_type: The type of compound identifiers, 'name' or 'cid'. Defaults to 'name'. + verbosity: The number of unmatched identifiers to print, can be either non-negative values or "all". 
+ Defaults to 5. copy: Determines whether a copy of the `adata` is returned. Defaults to False. Returns: @@ -50,7 +49,7 @@ def annotate_compounds( adata = adata.copy() if query_id not in adata.obs.columns: - raise ValueError(f"The requested query_id {query_id} is not in `adata.obs`. \n" "Please check again. ") + raise ValueError(f"The requested query_id {query_id} is not in `adata.obs`.\n" "Please check again. ") query_dict = {} not_matched_identifiers = [] diff --git a/pertpy/metadata/_look_up.py b/pertpy/metadata/_look_up.py index 382fcf2e..04e3d3ea 100644 --- a/pertpy/metadata/_look_up.py +++ b/pertpy/metadata/_look_up.py @@ -24,10 +24,10 @@ def __init__( ): """ Args: - type: metadata type for annotation, cell_line, compound or moa. Defaults to cell_line. - transfer_metadata: dataframes used to generate Lookup object. This is currently set to None for CompoundMetaData which does not require any dataframes for transfer. Defaults to None. To ensure efficient transfer of - metadata during initialization, LookUp object should always be generated by the corresponding MetaData - class. Also, different MetaData classes have different required metadata to transfer. + type: Metadata type for annotation. One of 'cell_line', 'compound' or 'moa'. Defaults to 'cell_line'. + transfer_metadata: DataFrames used to generate Lookup object. + This is currently set to None for CompoundMetaData which does not require any dataframes for transfer. + Defaults to None. """ if type == "cell_line": self.type = type @@ -245,7 +245,6 @@ def available_cell_lines( "stripped_cell_line_name". Defaults to "DepMap_ID". query_id_list: Unique cell line identifiers to test the number of matched ids present in the metadata. If set to None, the query of metadata identifiers will be disabled. Defaults to None. 
- """ if self.type != "cell_line": raise ValueError("This is not a LookUp object specifically for CellLineMetaData!") @@ -336,12 +335,20 @@ def available_drug_response( """A brief summary of drug response data. Args: - gdsc_dataset: The GDSC dataset, 1 or 2. Defaults to 1. The GDSC1 dataset updates previous releases with additional drug screening data from the Wellcome Sanger Institute and Massachusetts General Hospital. It covers 970 Cell lines and 403 Compounds with 333292 IC50s. GDSC2 is new and has 243,466 IC50 results from the latest screening at the Wellcome Sanger Institute using improved experimental procedures. - reference_id: The type of cell line identifier in the meta data, cell_line_name, sanger_model_id or cosmic_id. Defaults to "cell_line_name". - query_id_list: Unique cell line identifiers to test the number of matched ids present in the metadata. If set to None, the query of metadata identifiers will be disabled. Defaults to None. - reference_perturbation: The perturbation information in the meta data, drug_name or drug_id. Defaults to "drug_name". - query_perturbation_list: Unique perturbation types to test the number of matched ones present in the metadata. If set to None, the query of perturbation types will be disabled. Defaults to None. - + gdsc_dataset: The GDSC dataset, 1 or 2. Defaults to 1. + The GDSC1 dataset updates previous releases with additional drug screening data from the Wellcome Sanger Institute and Massachusetts General Hospital. + It covers 970 Cell lines and 403 Compounds with 333292 IC50s. + GDSC2 is new and has 243,466 IC50 results from the latest screening at the Wellcome Sanger Institute using improved experimental procedures. + reference_id: The type of cell line identifier in the meta data, cell_line_name, sanger_model_id or cosmic_id. + Defaults to 'cell_line_name'. + query_id_list: Unique cell line identifiers to test the number of matched ids present in the metadata. 
+ If set to None, the query of metadata identifiers will be disabled. + Defaults to None. + reference_perturbation: The perturbation information in the meta data, drug_name or drug_id. + Defaults to 'drug_name'. + query_perturbation_list: Unique perturbation types to test the number of matched ones present in the metadata. + If set to None, the query of perturbation types will be disabled. + Defaults to None. """ if self.type != "cell_line": raise ValueError("This is not a LookUp object specific for CellLineMetaData!") @@ -378,8 +385,10 @@ def available_genes_annotation( """A brief summary of gene annotation metadata Args: - reference_id: The type of gene identifier in the meta data, gene_id, ensembl_gene_id, hgnc_id, hgnc_symbol. Defaults to "ensembl_gene_id". - query_id_list: Unique gene identifiers to test the number of matched ids present in the metadata. Defaults to None. + reference_id: The type of gene identifier in the meta data, gene_id, ensembl_gene_id, hgnc_id, hgnc_symbol. + Defaults to "ensembl_gene_id". + query_id_list: Unique gene identifiers to test the number of matched ids present in the metadata. + Defaults to None. """ if self.type != "cell_line": raise ValueError("This is not a LookUp object specific for CellLineMetaData!") @@ -415,8 +424,12 @@ def available_moa( """A brief summary of MoA annotation. Args: - query_id_list: Unique perturbagens to test the number of matched ones present in the metadata. If set to None, the query of metadata perturbagens will be disabled. Defaults to None. - target_list: Unique molecular targets to test the number of matched ones present in the metadata. If set to None, the comparison of molecular targets in the query of metadata perturbagens will be disabled. Defaults to None. + query_id_list: Unique perturbagens to test the number of matched ones present in the metadata. + If set to None, the query of metadata perturbagens will be disabled. + Defaults to None. 
+ target_list: Unique molecular targets to test the number of matched ones present in the metadata. + If set to None, the comparison of molecular targets in the query of metadata perturbagens will be disabled. + Defaults to None. """ if self.type != "moa": raise ValueError("This is not a LookUp object specific for MoaMetaData!") @@ -443,8 +456,10 @@ def available_compounds( """A brief summary of compound annotation. Args: - query_id_list: Unique compounds to test the number of matched ones present in the metadata. If set to None, query of compound identifiers will be disabled. Defaults to None. - query_id_type: The type of compound identifiers, name or cid. Defaults to "name". + query_id_list: Unique compounds to test the number of matched ones present in the metadata. + If set to None, query of compound identifiers will be disabled. + Defaults to None. + query_id_type: The type of compound identifiers, name or cid. Defaults to 'name'. """ if self.type != "compound": raise ValueError("This is not a LookUp object specific for CompoundData!") diff --git a/pertpy/metadata/_metadata.py b/pertpy/metadata/_metadata.py index 6657c576..d1ae6c64 100644 --- a/pertpy/metadata/_metadata.py +++ b/pertpy/metadata/_metadata.py @@ -7,9 +7,6 @@ class MetaData: - def __init__(self): - pass - def _warn_unmatch( self, total_identifiers: int, diff --git a/pertpy/metadata/_moa.py b/pertpy/metadata/_moa.py index 534ac1f9..5238777d 100644 --- a/pertpy/metadata/_moa.py +++ b/pertpy/metadata/_moa.py @@ -1,7 +1,7 @@ from __future__ import annotations from pathlib import Path -from typing import TYPE_CHECKING, Literal, Union +from typing import TYPE_CHECKING import numpy as np import pandas as pd @@ -52,8 +52,10 @@ def annotate_moa( Args: adata: The data object to annotate. query_id: The column of `.obs` with the name of a perturbagen. Defaults to "pert_iname". - target: The column of `.obs` with target information. If set to None, all MoAs are retrieved without comparing molecular targets. 
Defaults to None. - verbosity: The number of unmatched identifiers to print, can be either non-negative values or "all". Defaults to 5. + target: The column of `.obs` with target information. If set to None, all MoAs are retrieved without comparing molecular targets. + Defaults to None. + verbosity: The number of unmatched identifiers to print, can be either non-negative values or "all". + Defaults to 5. copy: Determines whether a copy of the `adata` is returned. Defaults to False. Returns: @@ -89,11 +91,8 @@ def annotate_moa( .drop("key_0", axis=1) ) - # If target column is given, - # we check whether it is one of the targets listed in the metadata - # If inconsistent, we treat this perturbagen as unmatched and - # overwrite the annotated metadata with NaN - + # If target column is given, check whether it is one of the targets listed in the metadata. + # If inconsistent, treat this perturbagen as unmatched and overwrite the annotated metadata with NaN if target is not None: target_meta = "target" if target != "target" else "target_fromMeta" adata.obs[target_meta] = adata.obs[target_meta].mask( @@ -102,10 +101,8 @@ def annotate_moa( pertname_meta = "pert_iname" if query_id != "pert_iname" else "pert_iname_fromMeta" adata.obs.loc[adata.obs[target_meta].isna(), [pertname_meta, "moa"]] = np.nan - # If query_id and reference_id have different names, - # there will be a column for each of them after merging, - # which is redundant as they refer to the same information. - # We will move the reference_id column. + # If query_id and reference_id have different names, there will be a column for each of them after merging, + # which is redundant as they refer to the same information. Hence, remove the reference_id column. 
if query_id != "pert_iname": del adata.obs["pert_iname"] diff --git a/pertpy/plot/__init__.py b/pertpy/plot/__init__.py index 94830fe7..36be5496 100644 --- a/pertpy/plot/__init__.py +++ b/pertpy/plot/__init__.py @@ -1,6 +1,5 @@ from pertpy.plot._augur import AugurpyPlot as ag from pertpy.plot._coda import CodaPlot as coda from pertpy.plot._guide_rna import GuideRnaPlot as guide -from pertpy.plot._metadata import MetaDataPlot as md from pertpy.plot._milopy import MilopyPlot as milo from pertpy.plot._mixscape import MixscapePlot as ms diff --git a/pertpy/plot/_metadata.py b/pertpy/plot/_metadata.py deleted file mode 100644 index f8f64e09..00000000 --- a/pertpy/plot/_metadata.py +++ /dev/null @@ -1,111 +0,0 @@ -from __future__ import annotations - -import warnings -from typing import TYPE_CHECKING, Union - -if TYPE_CHECKING: - from collections.abc import Iterable - - import pandas as pd - -import numpy as np -from matplotlib import pyplot as plt - -if TYPE_CHECKING: - from anndata import AnnData - - -class MetaDataPlot: - """Plotting functions for Metadata.""" - - @staticmethod - def plot_correlation( - adata: AnnData, - corr: pd.DataFrame, - pval: pd.DataFrame, - identifier: str = "DepMap_ID", - metadata_key: str = "bulk_rna_broad", - category: str = "cell line", - subset_identifier: str | int | Iterable[str] | Iterable[int] | None = None, - ) -> None: - """Visualise the correlation of cell lines with annotated metadata. - - Args: - adata: Input data object. - corr: Pearson correlation scores. If not available, please call the function `pt.md.CellLine.correlate()` first. - pval: P-values for pearson correlation. If not available, please call the function `pt.md.CellLine.correlate()` first. - identifier: Column in `.obs` containing the identifiers. Defaults to "DepMap_ID". - metadata_key: Key of the AnnData obsm for comparison with the X matrix. Defaults to "bulk_rna_broad". - category: The category for correlation comparison. Defaults to "cell line". 
- subset_identifier: Selected identifiers for scatter plot visualization between the X matrix and `metadata_key`. - If None, all cell lines will be plotted. - If not None, only the chosen cell line will be plotted, either speficied as a value in `identifier` (string) or as an index number. - Defaults to None. - Returns: - Pearson correlation coefficients and their corresponding p-values for matched and unmatched cell lines separately. - """ - if category == "cell line": - if subset_identifier is None: - annotation = "\n".join( - ( - f"Mean pearson correlation: {np.mean(np.diag(corr)):.4f}", - f"Mean p-value: {np.mean(np.diag(pval)):.4f}", - ) - ) - plt.scatter(x=adata.obsm[metadata_key], y=adata.X) - plt.xlabel(metadata_key) - plt.ylabel("Baseline") - else: - subset_identifier_list = ( - [subset_identifier] if isinstance(subset_identifier, (str, int)) else list(subset_identifier) - ) - - if all(isinstance(id, int) and 0 <= id < adata.n_obs for id in subset_identifier_list): - # Visualize the chosen cell line at the given index - subset_identifier_list = adata.obs[identifier].values[subset_identifier_list] - elif not all(isinstance(id, str) for id in subset_identifier_list) or not set( - subset_identifier_list - ).issubset(adata.obs[identifier].unique()): - # The chosen cell line must be found in `identifier` - raise ValueError( - "`Subset_identifier` must contain either all strings or all integers within the index." 
- ) - - plt.scatter( - x=adata.obsm[metadata_key].loc[subset_identifier_list], - y=adata[adata.obs[identifier].isin(subset_identifier_list)].X, - ) - plt.xlabel( - f"{metadata_key}: {subset_identifier_list[0]}" - if len(subset_identifier_list) == 1 - else f"{metadata_key}" - ) - plt.ylabel(f"Baseline: {subset_identifier_list[0]}" if len(subset_identifier_list) == 1 else "Baseline") - - # Annotate with the correlation coefficient and p-value of the chosen cell lines - subset_cor = np.mean(np.diag(corr.loc[subset_identifier_list, subset_identifier_list])) - subset_pval = np.mean(np.diag(pval.loc[subset_identifier_list, subset_identifier_list])) - annotation = "\n".join( - ( - f"Pearson correlation: {subset_cor:.4f}", - f"P-value: {subset_pval:.4f}", - ) - ) - - plt.text( - 0.05, - 0.95, - annotation, - fontsize=10, - transform=plt.gca().transAxes, - verticalalignment="top", - bbox={ - "boxstyle": "round", - "alpha": 0.5, - "facecolor": "white", - "edgecolor": "black", - }, - ) - plt.show() - else: - raise NotImplementedError