Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement add_remark for PandasPdb #129

Merged
merged 10 commits into from
Jul 31, 2023
82 changes: 81 additions & 1 deletion biopandas/pdb/pandas_pdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import gzip
import sys
import warnings
import textwrap
from copy import deepcopy
from io import StringIO
from typing import List, Optional
Expand Down Expand Up @@ -225,7 +226,7 @@ def impute_element(self, records=("ATOM", "HETATM"), inplace=False):
Coordinate sections for which the element symbols should be
imputed.

inplace : bool, (default: False
inplace : bool, default: False
Performs the operation in-place if True and returns a copy of the
PDB DataFrame otherwise.

Expand All @@ -246,6 +247,85 @@ def impute_element(self, records=("ATOM", "HETATM"), inplace=False):
lambda x: x[0][1] if len(x[1]) == 3 else x[0][0], axis=1
)
return t

def add_remark(self, code, text='', indent=0):
    """Add custom REMARK entry.

    The remark is inserted so that the ordering of REMARK codes is preserved, i.e. a remark
    with code `n` is added directly after all stored remarks with codes less than or equal
    to `n`. If the object does not store any remarks, the remark is inserted right before the
    first of the ATOM, HETATM or ANISOU records; if there are no coordinate records either, it
    is appended after the last stored 'OTHERS' record.

    Stored remarks are assumed to be in ascending code order (the insertion point is found
    with a binary search), and the 'OTHERS' frame is assumed to have an integer index.

    Parameters
    ----------
    code : int
        REMARK code according to PDB standards.

    text : str, default: ''
        The text of the remark. If the text does not fit into a single line it will be wrapped
        into multiple lines of REMARK entries. Likewise, if the text contains new line
        characters it will be split accordingly. An empty line yields a blank REMARK entry.

    indent : int, default: 0
        Number of white spaces inserted before the text of the remark.

    Returns
    -------
    None
        `self.df['OTHERS']` is replaced by a new frame containing the remark, and the
        `line_idx` of every record located at or after the insertion line is shifted.

    """
    # Work on a copy so that a frame still referenced by the caller is never left half-updated
    if 'OTHERS' in self.df:
        df_others = self.df['OTHERS'].copy()
    else:
        df_others = pd.DataFrame(columns=['record_name', 'entry', 'line_idx'])
    record_types = [r for r in ('ATOM', 'HETATM', 'ANISOU') if r in self.df]
    remarks = df_others.loc[df_others['record_name'] == 'REMARK', 'entry']

    # Find index and line_idx where to insert the remark to preserve remark code order
    if len(remarks):
        remark_codes = remarks.apply(lambda x: x.split(maxsplit=1)[0]).astype(int)
        insertion_pos = remark_codes.searchsorted(code, side='right')
        if insertion_pos < len(remark_codes):  # Remark in the middle
            insertion_idx = remark_codes.index[insertion_pos]
            insertion_line_idx = df_others.loc[insertion_idx, 'line_idx']
        else:  # Last remark
            # Anchor on the last stored REMARK, not on the last 'OTHERS' row: 'OTHERS' may
            # also hold records located before the remarks (HEADER, TITLE, ...) and after
            # the coordinates (CONECT, MASTER, END, ...).
            last_remark_idx = remark_codes.index[-1]
            insertion_idx = last_remark_idx + 1
            insertion_line_idx = df_others.loc[last_remark_idx, 'line_idx'] + 1
    else:  # First remark
        # Ignore empty frames: their minimum is NaN and would poison min()
        first_lines = [
            self.df[r]['line_idx'].min() for r in record_types if len(self.df[r])
        ]
        if first_lines:
            insertion_idx = 0
            insertion_line_idx = min(first_lines)
        elif len(df_others):  # No coordinates at all: append after the stored records
            insertion_idx = df_others.index.max() + 1
            insertion_line_idx = df_others['line_idx'].max() + 1
        else:  # Completely empty structure
            insertion_idx = 0
            insertion_line_idx = 0
    insertion_idx = int(insertion_idx)
    insertion_line_idx = int(insertion_line_idx)

    # Wrap remark to fit into 80 characters per line and add indentation.
    # 11 = 6 characters of the record name + 4 of the code + 1 separating blank.
    wrapper = textwrap.TextWrapper(width=80 - (11 + indent))
    prefix = f'{code:4} ' + indent * ' '
    lines = []
    for raw_line in text.split('\n'):
        # A line without content still produces one (blank) REMARK entry
        lines.extend(wrapper.wrap(raw_line.strip()) or [' '])
    lines = [prefix + line for line in lines]
    n_lines = len(lines)

    # Create space in OTHERS: shift rows located at or after the insertion point
    if len(df_others):
        line_idx = df_others['line_idx']
        df_others['line_idx'] = line_idx.where(
            line_idx < insertion_line_idx, line_idx + n_lines
        )
        df_others.index = [
            idx + n_lines if idx >= insertion_idx else idx for idx in df_others.index
        ]
    # Shift only those coordinate records that follow the inserted remark
    for records in record_types:
        df_records = self.df[records]
        line_idx = df_records['line_idx']
        df_records['line_idx'] = line_idx.where(
            line_idx < insertion_line_idx, line_idx + n_lines
        )

    # Put remark into 'OTHERS' data frame
    df_remark = pd.DataFrame(
        {
            'record_name': 'REMARK',
            'entry': lines,
            'line_idx': range(insertion_line_idx, insertion_line_idx + n_lines),
        },
        index=range(insertion_idx, insertion_idx + n_lines),
    )
    # Skip an empty 'OTHERS' frame so that it cannot degrade the dtypes of the result
    frames = [frame for frame in (df_others, df_remark) if len(frame)]
    self.df['OTHERS'] = pd.concat(frames).sort_index()

@staticmethod
def rmsd(df1, df2, s=None, invert=False, decimals=4):
Expand Down
69 changes: 69 additions & 0 deletions biopandas/pdb/tests/test_write_pdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@
TESTDATA_FILENAME2 = os.path.join(
os.path.dirname(__file__), "data", "4eiy_anisouchunk.pdb"
)
TESTDATA_FILENAME3 = os.path.join(
os.path.dirname(__file__), "data", "5mtn_multichain.pdb"
)
OUTFILE = os.path.join(os.path.dirname(__file__), "data", "tmp.pdb")
OUTFILE_GZ = os.path.join(os.path.dirname(__file__), "data", "tmp.pdb.gz")

Expand Down Expand Up @@ -73,6 +76,71 @@ def test_anisou():
assert f1 == four_eiy


def test_add_remark():
    """Check that new REMARK entries are written after the stored remarks of the same code."""
    # Three remarks with the same code: one needing wrapping, one blank, one multi-line
    remark_code = 3
    long_text = 'THIS IS A HIGHLY IMPORTANT FREE-TEXT REMARK WHICH IS EXACTLY 80 CHARACTERS LONG.'
    blank_text = ''
    multiline_text = 'THIS IS A NEXT MULTI-LINE INDENTED REMARK\n FOLLOWING THE BLANK REMARK.'

    structure = PandasPdb()
    structure.read_pdb(TESTDATA_FILENAME)
    atom_count = len(structure.df['ATOM'])
    # Only the last remark passes an explicit indent (5)
    for remark_args in ((long_text,), (blank_text,), (multiline_text, 5)):
        structure.add_remark(remark_code, *remark_args)
    structure.to_pdb(path=OUTFILE)

    # The written file must contain the old and new remarks as one contiguous block.
    # The match is an exact substring test, so the spacing inside these literals matters.
    with open(OUTFILE, "r") as handle:
        written = handle.read()
    expected_block = "".join(
        (
            "REMARK 3 OTHER REFINEMENT REMARKS: HYDROGENS HAVE BEEN ADDED IN THE RIDING \n",
            "REMARK 3 POSITIONS \n",
            "REMARK 3 THIS IS A HIGHLY IMPORTANT FREE-TEXT REMARK WHICH IS EXACTLY 80 \n",
            "REMARK 3 CHARACTERS LONG. \n",
            "REMARK 3 \n",
            "REMARK 3 THIS IS A NEXT MULTI-LINE INDENTED REMARK \n",
            "REMARK 3 FOLLOWING THE BLANK REMARK. \n",
            "REMARK 4 \n",
        )
    )
    assert expected_block in written

    # Re-reading the written file must yield the same number of ATOM records
    reloaded = PandasPdb()
    reloaded.read_pdb(OUTFILE)
    os.remove(OUTFILE)
    assert len(reloaded.df['ATOM']) == atom_count


def test_introduce_remark():
    """Check that a REMARK added to a file without remarks is written at the very top."""
    remark_code = 3
    remark_text = 'THIS IS A HIGHLY IMPORTANT FREE-TEXT REMARK WHICH IS EXACTLY 80 CHARACTERS LONG.'
    remark_indent = 1

    structure = PandasPdb()
    structure.read_pdb(TESTDATA_FILENAME3)
    atom_count = len(structure.df['ATOM'])
    structure.add_remark(remark_code, remark_text, remark_indent)
    structure.to_pdb(path=OUTFILE)

    # The written file must begin with the (wrapped) new remark.
    # The match is an exact prefix test, so the spacing inside these literals matters.
    with open(OUTFILE, "r") as handle:
        written = handle.read()
    expected_head = "".join(
        (
            "REMARK 3 THIS IS A HIGHLY IMPORTANT FREE-TEXT REMARK WHICH IS EXACTLY 80 \n",
            "REMARK 3 CHARACTERS LONG. \n",
        )
    )
    assert written.startswith(expected_head)

    # Re-reading the written file must yield the same number of ATOM records
    reloaded = PandasPdb()
    reloaded.read_pdb(OUTFILE)
    os.remove(OUTFILE)
    assert len(reloaded.df['ATOM']) == atom_count


def test_b_factor_shift():
"""Test b_factor shifting one white space when saving the fetched pdb."""
ppdb = PandasPdb()
Expand All @@ -82,3 +150,4 @@ def test_b_factor_shift():
os.remove(OUTFILE)
assert tmp_df[tmp_df["element_symbol"].isnull() | (tmp_df["element_symbol"] == '')].empty
assert not tmp_df[tmp_df["blank_4"].isnull() | (tmp_df["blank_4"] == '')].empty

4 changes: 2 additions & 2 deletions docs/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ The CHANGELOG for the current development version is available at
[https://github.com/rasbt/biopandas/blob/main/docs/sources/CHANGELOG.md](https://github.com/rasbt/biopandas/blob/main/docs/sources/CHANGELOG.md).


### 0.5.0dev1 (24/5/2023)

### 0.5.0dev1 (31/7/2023)
- Implements `add_remark` for PandasPdb. (Via [Anton Bushuiev](https://github.com/anton-bushuiev), PR #[129](https://github.com/BioPandas/biopandas/pull/129))
- B_factor shifting one white space issue fix. (Via [Zehra Sarica](https://github.com/zehraacarsarica), PR #[134](https://github.com/BioPandas/biopandas/pull/134))
- Adds support for pathlib. (Via [Anton Bushuiev](https://github.com/anton-bushuiev), PR #[128](https://github.com/BioPandas/biopandas/pull/128))
- Adds support for reading Gzipped MMTF files. (Via [Arian Jamasb](https://github.com/a-r-j), PR #[123](https://github.com/rasbt/biopandas/pull/123/files))
Expand Down