diff --git a/graphein/protein/features/nodes/dssp.py b/graphein/protein/features/nodes/dssp.py index 863e4e53..e5341390 100644 --- a/graphein/protein/features/nodes/dssp.py +++ b/graphein/protein/features/nodes/dssp.py @@ -94,7 +94,7 @@ def process_dssp_df(df: pd.DataFrame) -> pd.DataFrame: return df -def add_dssp_df(G: nx.Graph, dssp_config: Optional[DSSPConfig]) -> nx.Graph: +def add_dssp_df(G: nx.Graph, dssp_config: Optional[DSSPConfig], new_pdb_path: str = None) -> nx.Graph: """ Construct DSSP dataframe and add as graph level variable to protein graph @@ -102,12 +102,18 @@ def add_dssp_df(G: nx.Graph, dssp_config: Optional[DSSPConfig]) -> nx.Graph: :param G: nx.Graph :param dssp_config: DSSPConfig object. Specifies which executable to run. Located in graphein.protein.config :type dssp_config: DSSPConfig, optional + :param new_pdb_path: specifies a new path the local pdb file in case it was moved. + :type dssp_config: DSSPConfig, optional :return: Protein graph with DSSP dataframe added :rtype: nx.Graph """ config = G.graph["config"] - pdb_id = G.graph["pdb_id"] + pdb_code = G.graph["pdb_code"] + if new_pdb_path: + pdb_path = new_pdb_path + else: + pdb_path = G.graph["pdb_path"] # Extract DSSP executable executable = dssp_config.executable @@ -117,11 +123,17 @@ def add_dssp_df(G: nx.Graph, dssp_config: Optional[DSSPConfig]) -> nx.Graph: executable ), "DSSP must be on PATH and marked as an executable" - # Check for existence of pdb file. If not, download it. - if not os.path.isfile(config.pdb_dir / (pdb_id + ".pdb")): - pdb_file = download_pdb(config, pdb_id) + if pdb_code: + # Check for existence of pdb file. If not, download it. + if not os.path.isfile(config.pdb_dir / (pdb_code + ".pdb")): + pdb_file = download_pdb(config, pdb_code) + else: + pdb_file = config.pdb_dir / (pdb_code + ".pdb") else: - pdb_file = config.pdb_dir / (pdb_id + ".pdb") + assert os.path.isfile(pdb_path), f"The PDB file could not be found under the path: " \ + f" {G.graph['pdb_path']}. If the file was moved, specify the new " \ + f" path using the parameter new_pdb_path" + pdb_file = pdb_path if config.verbose: print(f"Using DSSP executable '{executable}'") diff --git a/graphein/protein/graphs.py b/graphein/protein/graphs.py index 2ac73c25..fa58929d 100644 --- a/graphein/protein/graphs.py +++ b/graphein/protein/graphs.py @@ -69,8 +69,9 @@ def read_pdb_to_dataframe( :returns: ``pd.DataFrame`` containing protein structure :rtype: pd.DataFrame """ - if pdb_code is None and pdb_path is None: - raise NameError("One of pdb_code or pdb_path must be specified!") + + assert (pdb_code and not pdb_path) or (not pdb_code and pdb_path), 'Either a PDB ID or a path to a local PDB file' \ + ' must be specified to read a PDB' atomic_df = ( PandasPdb().read_pdb(pdb_path) @@ -349,8 +350,10 @@ def select_chains( def initialise_graph_with_metadata( protein_df: pd.DataFrame, raw_pdb_df: pd.DataFrame, - pdb_id: str, granularity: str, + name: Optional[str] = None, + pdb_code: Optional[str] = None, + pdb_path: Optional[str] = None, ) -> nx.Graph: """ Initializes the nx Graph object with initial metadata. @@ -359,17 +362,33 @@ def initialise_graph_with_metadata( :type protein_df: pd.DataFrame :param raw_pdb_df: Unprocessed dataframe of protein structure for comparison and traceability downstream. :type raw_pdb_df: pd.DataFrame - :param pdb_id: PDB Accession code. - :type pdb_id: str :param granularity: Granularity of the graph (eg ``"atom"``, ``"CA"``, ``"CB"`` etc or ``"centroid"``). See: :const:`~graphein.protein.config.GRAPH_ATOMS` and :const:`~graphein.protein.config.GRANULARITY_OPTS`. :type granularity: str + :param name: given name for graph + :type name: str + :param pdb_code: PDB ID / Accession code, if the PDB is available on the PDB database. + :type pdb_code: str + :param pdb_path: path to local PDB file. + :type pdb_path: str :return: Returns initial protein structure graph with metadata. :rtype: nx.Graph """ + + assert (pdb_code and not pdb_path) or (not pdb_code and pdb_path), 'Either a PDB ID or a path to a local PDB file' \ + ' must be specified to read a PDB' + + # Get name for graph if no name was provided + if not name: + if pdb_path: + name = get_protein_name_from_filename(pdb_path) + else: + name = pdb_code + G = nx.Graph( - name=pdb_id, - pdb_id=pdb_id, + name=name, + pdb_code=pdb_code, + pdb_path=pdb_path, chain_ids=list(protein_df["chain_id"].unique()), pdb_df=protein_df, raw_pdb_df=raw_pdb_df, @@ -501,6 +520,7 @@ def compute_edges( def construct_graph( config: Optional[ProteinGraphConfig] = None, + name: Optional[str] = None, pdb_path: Optional[str] = None, pdb_code: Optional[str] = None, chain_selection: str = "all", @@ -520,9 +540,11 @@ def construct_graph( :param config: :class:`~graphein.protein.config.ProteinGraphConfig` object. If None, defaults to config in ``graphein.protein.config``. :type config: graphein.protein.config.ProteinGraphConfig, optional + :param name: an optional given name for the graph. the PDB ID or PDB file name will be used if not specified. + :type name: str, optional :param pdb_path: Path to ``pdb_file`` to build graph from. Default is ``None``. :type pdb_path: str, optional - :param pdb_code: 4-character PDB accession pdb_code to build graph from. Default is ``None``. + :param pdb_code: 4-character PDB pdb ID / accession code to build graph from. Default is ``None``. :type pdb_code: str, optional :param chain_selection: String of polypeptide chains to include in graph. E.g ``"ABDF"`` or ``"all"``. Default is ``"all"``. :type chain_selection: str @@ -540,14 +562,13 @@ def construct_graph( :type: nx.Graph """ + assert (pdb_code and not pdb_path) or (not pdb_code and pdb_path), 'Either a PDB ID or a path to a local PDB file' \ + ' must be specified to construct a graph' + # If no config is provided, use default if config is None: config = ProteinGraphConfig() - # Get name from pdb_file is no pdb_code is provided - if pdb_path and (pdb_code is None): - pdb_code = get_protein_name_from_filename(pdb_path) - # If config params are provided, overwrite them config.protein_df_processing_functions = ( df_processing_funcs @@ -589,7 +610,9 @@ def construct_graph( g = initialise_graph_with_metadata( protein_df=protein_df, raw_pdb_df=raw_df.df["ATOM"], - pdb_id=pdb_code, + name=name, + pdb_code=pdb_code, + pdb_path=pdb_path, granularity=config.granularity, ) # Add nodes to graph