Skip to content

Commit

Permalink
refactored protein graphs to always have as params: name, pdb_code, p…
Browse files Browse the repository at this point in the history
…db_path. Also fixes a-r-j#171, which was not properly fixed by a-r-j#172
  • Loading branch information
avivko committed May 22, 2022
1 parent 2262250 commit 167b5e2
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 19 deletions.
24 changes: 18 additions & 6 deletions graphein/protein/features/nodes/dssp.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,20 +94,26 @@ def process_dssp_df(df: pd.DataFrame) -> pd.DataFrame:
return df


def add_dssp_df(G: nx.Graph, dssp_config: Optional[DSSPConfig]) -> nx.Graph:
def add_dssp_df(G: nx.Graph, dssp_config: Optional[DSSPConfig], new_pdb_path: str = None) -> nx.Graph:
"""
Construct DSSP dataframe and add as graph level variable to protein graph
:param G: Input protein graph
:type G: nx.Graph
:param dssp_config: DSSPConfig object. Specifies which executable to run. Located in graphein.protein.config
:type dssp_config: DSSPConfig, optional
:param new_pdb_path: Specifies a new path to the local PDB file in case it was moved.
:type new_pdb_path: str, optional
:return: Protein graph with DSSP dataframe added
:rtype: nx.Graph
"""

config = G.graph["config"]
pdb_id = G.graph["pdb_id"]
pdb_code = G.graph["pdb_code"]
if new_pdb_path:
pdb_path = new_pdb_path
else:
pdb_path = G.graph["pdb_path"]

# Extract DSSP executable
executable = dssp_config.executable
Expand All @@ -117,11 +123,17 @@ def add_dssp_df(G: nx.Graph, dssp_config: Optional[DSSPConfig]) -> nx.Graph:
executable
), "DSSP must be on PATH and marked as an executable"

# Check for existence of pdb file. If not, download it.
if not os.path.isfile(config.pdb_dir / (pdb_id + ".pdb")):
pdb_file = download_pdb(config, pdb_id)
if pdb_code:
# Check for existence of pdb file. If not, download it.
if not os.path.isfile(config.pdb_dir / (pdb_code + ".pdb")):
pdb_file = download_pdb(config, pdb_code)
else:
pdb_file = config.pdb_dir / (pdb_code + ".pdb")
else:
pdb_file = config.pdb_dir / (pdb_id + ".pdb")
assert os.path.isfile(pdb_path), f"The PDB file could not be found under the path: " \
f" {G.graph['pdb_path']}. If the file was moved, specify the new " \
f" path using the parameter new_pdb_path"
pdb_file = pdb_path

if config.verbose:
print(f"Using DSSP executable '{executable}'")
Expand Down
49 changes: 36 additions & 13 deletions graphein/protein/graphs.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,9 @@ def read_pdb_to_dataframe(
:returns: ``pd.DataFrame`` containing protein structure
:rtype: pd.DataFrame
"""
if pdb_code is None and pdb_path is None:
raise NameError("One of pdb_code or pdb_path must be specified!")

assert (pdb_code and not pdb_path) or (not pdb_code and pdb_path), 'Either a PDB ID or a path to a local PDB file' \
' must be specified to read a PDB'

atomic_df = (
PandasPdb().read_pdb(pdb_path)
Expand Down Expand Up @@ -349,8 +350,10 @@ def select_chains(
def initialise_graph_with_metadata(
protein_df: pd.DataFrame,
raw_pdb_df: pd.DataFrame,
pdb_id: str,
granularity: str,
name: Optional[str] = None,
pdb_code: Optional[str] = None,
pdb_path: Optional[str] = None,
) -> nx.Graph:
"""
Initializes the nx Graph object with initial metadata.
Expand All @@ -359,17 +362,33 @@ def initialise_graph_with_metadata(
:type protein_df: pd.DataFrame
:param raw_pdb_df: Unprocessed dataframe of protein structure for comparison and traceability downstream.
:type raw_pdb_df: pd.DataFrame
:param pdb_id: PDB Accession code.
:type pdb_id: str
:param granularity: Granularity of the graph (eg ``"atom"``, ``"CA"``, ``"CB"`` etc or ``"centroid"``).
See: :const:`~graphein.protein.config.GRAPH_ATOMS` and :const:`~graphein.protein.config.GRANULARITY_OPTS`.
:type granularity: str
:param name: given name for graph
:type name: str
:param pdb_code: PDB ID / Accession code, if the PDB is available on the PDB database.
:type pdb_code: str
:param pdb_path: path to local PDB file.
:type pdb_path: str
:return: Returns initial protein structure graph with metadata.
:rtype: nx.Graph
"""

assert (pdb_code and not pdb_path) or (not pdb_code and pdb_path), 'Either a PDB ID or a path to a local PDB file' \
' must be specified to read a PDB'

# Get name for graph if no name was provided
if not name:
if pdb_path:
name = get_protein_name_from_filename(pdb_path)
else:
name = pdb_code

G = nx.Graph(
name=pdb_id,
pdb_id=pdb_id,
name=name,
pdb_code=pdb_code,
pdb_path=pdb_path,
chain_ids=list(protein_df["chain_id"].unique()),
pdb_df=protein_df,
raw_pdb_df=raw_pdb_df,
Expand Down Expand Up @@ -501,6 +520,7 @@ def compute_edges(

def construct_graph(
config: Optional[ProteinGraphConfig] = None,
name: Optional[str] = None,
pdb_path: Optional[str] = None,
pdb_code: Optional[str] = None,
chain_selection: str = "all",
Expand All @@ -520,9 +540,11 @@ def construct_graph(
:param config: :class:`~graphein.protein.config.ProteinGraphConfig` object. If None, defaults to config in ``graphein.protein.config``.
:type config: graphein.protein.config.ProteinGraphConfig, optional
:param name: An optional given name for the graph. The PDB ID or PDB file name will be used if not specified.
:type name: str, optional
:param pdb_path: Path to ``pdb_file`` to build graph from. Default is ``None``.
:type pdb_path: str, optional
:param pdb_code: 4-character PDB accession pdb_code to build graph from. Default is ``None``.
:param pdb_code: 4-character PDB ID / accession code to build graph from. Default is ``None``.
:type pdb_code: str, optional
:param chain_selection: String of polypeptide chains to include in graph. E.g ``"ABDF"`` or ``"all"``. Default is ``"all"``.
:type chain_selection: str
Expand All @@ -540,14 +562,13 @@ def construct_graph(
:type: nx.Graph
"""

assert (pdb_code and not pdb_path) or (not pdb_code and pdb_path), 'Either a PDB ID or a path to a local PDB file' \
' must be specified to construct a graph'

# If no config is provided, use default
if config is None:
config = ProteinGraphConfig()

# Get name from pdb_file is no pdb_code is provided
if pdb_path and (pdb_code is None):
pdb_code = get_protein_name_from_filename(pdb_path)

# If config params are provided, overwrite them
config.protein_df_processing_functions = (
df_processing_funcs
Expand Down Expand Up @@ -589,7 +610,9 @@ def construct_graph(
g = initialise_graph_with_metadata(
protein_df=protein_df,
raw_pdb_df=raw_df.df["ATOM"],
pdb_id=pdb_code,
name=name,
pdb_code=pdb_code,
pdb_path=pdb_path,
granularity=config.granularity,
)
# Add nodes to graph
Expand Down

0 comments on commit 167b5e2

Please sign in to comment.