Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Assign view paired #324

Merged
merged 12 commits into from
Apr 1, 2022
71 changes: 28 additions & 43 deletions cooltools/api/snipping.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
is_cooler_balanced,
is_valid_expected,
)
from ..lib.common import assign_regions, make_cooler_view
from ..lib.common import assign_regions, make_cooler_view, assign_view_paired

from ..lib.numutils import LazyToeplitz
import warnings
Expand Down Expand Up @@ -82,12 +82,7 @@ def expand_align_features(features_df, flank, resolution, format="bed"):


def make_bin_aligned_windows(
binsize,
chroms,
centers_bp,
flank_bp=0,
region_start_bp=0,
ignore_index=False,
binsize, chroms, centers_bp, flank_bp=0, region_start_bp=0, ignore_index=False,
):
"""
Convert genomic loci into bin spans on a fixed bin-segmentation of a
Expand Down Expand Up @@ -305,10 +300,7 @@ def __init__(self, clr, cooler_opts=None, view_df=None, min_diag=2):
# Make sure view_df is a proper viewframe
try:
_ = is_compatible_viewframe(
view_df,
clr,
check_sorting=True,
raise_errors=True,
view_df, clr, check_sorting=True, raise_errors=True,
)
except Exception as e:
raise ValueError(
Expand Down Expand Up @@ -436,10 +428,7 @@ def __init__(
# Make sure view_df is a proper viewframe
try:
_ = is_compatible_viewframe(
view_df,
clr,
check_sorting=True,
raise_errors=True,
view_df, clr, check_sorting=True, raise_errors=True,
)
except Exception as e:
raise ValueError(
Expand All @@ -452,9 +441,7 @@ def __init__(
"cis",
view_df,
verify_cooler=clr,
expected_value_cols=[
self.expected_value_col,
],
expected_value_cols=[self.expected_value_col,],
raise_errors=True,
)
except Exception as e:
Expand Down Expand Up @@ -582,10 +569,7 @@ def __init__(
# Make sure view_df is a proper viewframe
try:
_ = is_compatible_viewframe(
view_df,
clr,
check_sorting=True,
raise_errors=True,
view_df, clr, check_sorting=True, raise_errors=True,
)
except Exception as e:
raise ValueError(
Expand All @@ -598,9 +582,7 @@ def __init__(
"cis",
view_df,
verify_cooler=clr,
expected_value_cols=[
self.expected_value_col,
],
expected_value_cols=[self.expected_value_col,],
raise_errors=True,
)
except Exception as e:
Expand Down Expand Up @@ -660,6 +642,7 @@ def pileup(
clr,
features_df,
view_df=None,
view_name_col="name",
expected_df=None,
expected_value_col="balanced.avg",
flank=100_000,
Expand Down Expand Up @@ -713,8 +696,22 @@ def pileup(
else:
raise ValueError("Unknown feature_df format")

features_df = assign_regions(features_df, view_df)
# TODO: switch to bioframe.assign_view upon update
if feature_type == "bed":
features_df = bioframe.assign_view(
features_df, view_df, df_view_col="region", view_name_col=view_name_col,
)
else:
features_df = assign_view_paired(
features_df, view_df, view_name_col=view_name_col
)
# Now we consolidate the region annotations into one column `region`
# Features that cross between regions get region `None`
# The rest get the `region1` value assigned to a new column `region`
features_df["region"] = np.where(
features_df["region1"] == features_df["region1"],
features_df["region1"],
None,
)

if flank is not None:
features_df = expand_align_features(
Expand All @@ -736,10 +733,7 @@ def pileup(
else:
try:
_ = is_compatible_viewframe(
view_df,
clr,
check_sorting=True,
raise_errors=True,
view_df, clr, check_sorting=True, raise_errors=True,
)
except Exception as e:
raise ValueError("view_df is not a valid viewframe or incompatible") from e
Expand Down Expand Up @@ -770,27 +764,18 @@ def pileup(
if feature_type == "bed":
features_df[["lo", "hi"]] = (
features_df[["lo", "hi"]]
.subtract(
features_df["region_offset"].fillna(0),
axis=0,
)
.subtract(features_df["region_offset"].fillna(0), axis=0,)
.astype(int)
)
else:
features_df[["lo1", "hi1"]] = (
features_df[["lo1", "hi1"]]
.subtract(
features_df["region_offset"].fillna(0),
axis=0,
)
.subtract(features_df["region_offset"].fillna(0), axis=0,)
.astype(int)
)
features_df[["lo2", "hi2"]] = (
features_df[["lo2", "hi2"]]
.subtract(
features_df["region_offset"].fillna(0),
axis=0,
)
.subtract(features_df["region_offset"].fillna(0), axis=0,)
.astype(int)
)

Expand Down
76 changes: 74 additions & 2 deletions cooltools/lib/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,76 @@
import bioframe


def assign_view_paired(
features,
view_df,
cols_paired=["chrom1", "start1", "end1", "chrom2", "start2", "end2"],
cols_view=["chrom", "start", "end"],
features_view_cols=["region1", "region2"],
view_name_col="name",
drop_unassigned=False,
):
"""Assign region names from the view to each feature

Assigns a regular 1D view independently to each side of a bedpe-style dataframe.
Will add two columns with region names (`features_view_cols`)

Parameters
----------
features : pd.DataFrame
bedpe-style dataframe
view_df : pandas.DataFrame
ViewFrame specifying region start and ends for assignment. Attempts to
convert dictionary and pd.Series formats to viewFrames.
cols_paired : list of str
he names of columns containing the chromosome, start and end of the
genomic intervals. The default values are 'chrom', 'start', 'end'.
cols_view : list of str
The names of columns containing the chromosome, start and end of the
genomic intervals in the view. The default values are 'chrom', 'start', 'end'.
features_view_cols : list of str
Names of the columns where to save the assigned region names
view_name_col : str
Column of ``view_df`` with region names. Default 'name'.
drop_unassigned : bool
If True, drop intervals in df that do not overlap a region in the view.
Default False.
"""
features = features.copy()
features.reset_index(inplace=True, drop=True)

cols_left = cols_paired[:3]
cols_right = cols_paired[3:]

bioframe.core.checks.is_bedframe(features, raise_errors=True, cols=cols_left)
bioframe.core.checks.is_bedframe(features, raise_errors=True, cols=cols_right)
view_df = bioframe.core.construction.make_viewframe(
view_df, view_name_col=view_name_col, cols=cols_view
)
features = bioframe.assign_view(
features,
view_df,
drop_unassigned=drop_unassigned,
df_view_col=features_view_cols[0],
view_name_col=view_name_col,
cols=cols_left,
cols_view=cols_view,
)
features[cols_right[1:]] = features[cols_right[1:]].astype(
int
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you check if this cast is needed for the final output - i.e. for after the second assign_view application ? just in case (also is it an expected behavior on bioframe side of things ? issue-worthy or not ?)

Copy link
Member Author

@Phlya Phlya Apr 1, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is because the df start off with numpy int dtypes, and then bioframe converts the other columns to numpy float64. But if it is pd.Int64Dtype from the beginning, this cast is not necessary.

In the second run of assign_view the cols_right[1:] get cast to pd.Int64Dtype, and the other ones are already that from this first cast... It's confusing. But the second cast is not necessary.

I don't know if this should be an issue in bioframe, I remember extended discussions about dtypes there and I think we couldn't find a way to avoid this sort of weird casting...

) # gets cast to float above...
features = bioframe.assign_view(
features,
view_df,
drop_unassigned=drop_unassigned,
df_view_col=features_view_cols[1],
view_name_col=view_name_col,
cols=cols_right,
cols_view=cols_view,
)
return features


def assign_regions(features, supports):
"""
DEPRECATED. Will be removed in the future versions and replaced with bioframe.overlap()
Expand Down Expand Up @@ -278,7 +348,10 @@ def align_track_with_cooler(
.copy()
.merge(
track.rename(columns={c: "chrom", s: "start", e: "end", v: "value"}),
how="left", on=["chrom", "start"], suffixes=("", "_"))
how="left",
on=["chrom", "start"],
suffixes=("", "_"),
)
)

if clr_weight_name:
Expand Down Expand Up @@ -310,5 +383,4 @@ def align_track_with_cooler(
if mask_bad_bins:
clr_track.loc[~valid_bins, "value"] = np.nan


return clr_track[["chrom", "start", "end", "value"]]
20 changes: 15 additions & 5 deletions tests/test_snipping.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,12 +104,16 @@ def test_pileup(request):
"end": [107_000_000, 113_000_000],
}
)
stack = cooltools.api.snipping.pileup(clr, windows, view_df, exp, flank=None)

stack = cooltools.api.snipping.pileup(
clr, windows, view_df=view_df, expected_df=exp, flank=None
)

# Check that the size of snips is OK and there are two of them:
assert stack.shape == (5, 5, 2)

# II.
# Example off-diagonal features, two regions from annotated genomic regions:
# Example off-diagonal features, two features from annotated genomic regions:
windows = pd.DataFrame(
{
"chrom1": ["chr1", "chr1"],
Expand All @@ -120,7 +124,9 @@ def test_pileup(request):
"end2": [112_000_000, 118_000_000],
}
)
stack = cooltools.api.snipping.pileup(clr, windows, view_df, exp, flank=None)
stack = cooltools.api.snipping.pileup(
clr, windows, view_df=view_df, expected_df=exp, flank=None
)
# Check that the size of snips is OK and there are two of them:
assert stack.shape == (5, 5, 2)

Expand All @@ -136,7 +142,9 @@ def test_pileup(request):
"end2": [110_000_000, 115_000_000],
}
)
stack = cooltools.api.snipping.pileup(clr, windows, view_df, exp, flank=None)
stack = cooltools.api.snipping.pileup(
clr, windows, view_df=view_df, expected_df=exp, flank=None
)
# Check that the size of snips is OK and there are two of them:
assert stack.shape == (5, 5, 2)

Expand Down Expand Up @@ -172,7 +180,9 @@ def test_pileup(request):
}
)
with pytest.raises(ValueError):
stack = cooltools.api.snipping.pileup(clr, windows, view_df, exp, flank=None)
stack = cooltools.api.snipping.pileup(
clr, windows, view_df=view_df, expected_df=exp, flank=None
)

# DRAFT # Should work with force=True:
# stack = cooltools.api.snipping.pileup(clr, windows, view_df, exp, flank=0, force=True)
Expand Down