Skip to content

Commit

Permalink
Merge pull request #98 from biocore/bugfixes-jointrpca
Browse files Browse the repository at this point in the history
Bug fixes for Joint-RPCA & TEMPTED
  • Loading branch information
cameronmartino authored Oct 1, 2024
2 parents 0594353 + fd42ad1 commit 8de57ac
Show file tree
Hide file tree
Showing 13 changed files with 12,877 additions and 12,825 deletions.
13 changes: 13 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,17 @@

v0.0.12 (2024-10-01)

### Bug fixes

* Joint-RPCA re-centering bug.
* see issue #97
* Updated for the newest version of Pandas.
* see issue #96
* Reduce unecessary numpy warnings regarding log on unseen data.
* see issue #78
* Fix filter and ordering with TEMPTED
* see issue #92 and issue #93

v0.0.11 (2024-06-24)

### Bug fixes
Expand Down
2 changes: 1 addition & 1 deletion gemelli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------

__version__ = "0.0.11"
__version__ = "0.0.12"
2 changes: 0 additions & 2 deletions gemelli/optspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -374,10 +374,8 @@ def joint_solve(self, multiple_obs):
S_shared = np.diag(S_shared)
S_shared = S_shared / np.linalg.norm(S_shared)
U_shared = np.average(sample_loadings, axis=0)
U_shared -= U_shared.mean(0)
feature_loadings = [(S_shared).dot(v_i.T).T
for v_i in feature_loadings]

# compensates the smaller average size of
# observed values vs. missing
S_shared = S_shared / rescal_param
Expand Down
9 changes: 6 additions & 3 deletions gemelli/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,9 +420,11 @@ def matrix_rclr(mat, branch_lengths=None):
raise ValueError('Data-matrix contains nans')
# take the log of the sample centered data
if branch_lengths is not None:
mat = np.log(matrix_closure(matrix_closure(mat) * branch_lengths))
with np.errstate(divide='ignore'):
mat = np.log(matrix_closure(matrix_closure(mat) * branch_lengths))
else:
mat = np.log(matrix_closure(mat))
with np.errstate(divide='ignore'):
mat = np.log(matrix_closure(mat))
# generate a mask of missing values
mask = [True] * mat.shape[0] * mat.shape[1]
mask = np.array(mat).reshape(mat.shape)
Expand Down Expand Up @@ -586,7 +588,8 @@ def matrix_closure(mat):
"""

mat = np.atleast_2d(mat)
mat = mat / mat.sum(axis=1, keepdims=True)
with np.errstate(divide='ignore', invalid='ignore'):
mat = mat / mat.sum(axis=1, keepdims=True)

return mat.squeeze()

Expand Down
2 changes: 1 addition & 1 deletion gemelli/q2/tests/test_transformer.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pandas as pd
import qiime2
import numpy as np
import pandas.util.testing as pdt
import pandas.testing as pdt
from skbio.util import get_data_path
from qiime2.plugin.testing import TestPluginBase
from gemelli.q2._format import TrajectoryFormat
Expand Down
28 changes: 21 additions & 7 deletions gemelli/rpca.py
Original file line number Diff line number Diff line change
Expand Up @@ -596,8 +596,9 @@ def frequency_filter(val, id_, md):
raise ValueError('Data-table contains duplicate sample IDs')
if len(table.ids('observation')) != len(set(table.ids('observation'))):
raise ValueError('Data-table contains duplicate feature IDs')
# ensure empty samples / features are removed
table = table.remove_empty()
if min_sample_count is not None:
# ensure empty samples / features are removed
table = table.remove_empty(inplace=False)

return table

Expand Down Expand Up @@ -746,10 +747,22 @@ def joint_rpca(tables: biom.Table,
pd.DataFrame):
sample_metadata = sample_metadata.to_dataframe()
if sample_metadata is None or train_test_column is None:
test_samples = sorted(list(shared_all_samples))[:n_test_samples]
# choose samples based on RPCA loadings on first table
# a somewhat intelligent way of making sure the test
# samples are representative of the data.
ord_sort, _ = optspace_helper(rclr_tables[0].T.values,
rclr_tables[0].index,
rclr_tables[0].columns,
n_components=n_components)
ord_sort = list(ord_sort.samples.iloc[:, 0].sort_values().index)
test_samples = np.round(np.linspace(0,
len(ord_sort) - 1,
n_test_samples)
).astype(int)
test_samples = [ord_sort[i] for i in test_samples]
train_samples = list(set(shared_all_samples) - set(test_samples))
else:
sample_metadata = sample_metadata.loc[shared_all_samples, :]
sample_metadata = sample_metadata.loc[list(shared_all_samples), :]
train_samples = sample_metadata[train_test_column] == 'train'
test_samples = sample_metadata[train_test_column] == 'test'
train_samples = sample_metadata[train_samples].index
Expand Down Expand Up @@ -909,9 +922,10 @@ def transform(ordination: OrdinationResults,
' the features in the ordination.')
elif subset_tables:
unshared_N = len(set(rclr_table_df.index)) - len(shared_features)
warnings.warn('Removing %i features(s) in table(s)'
' but not the ordination.'
% (unshared_N), RuntimeWarning)
if unshared_N != 0:
warnings.warn('Removing %i features(s) in table(s)'
' but not the ordination.'
% (unshared_N), RuntimeWarning)
else:
raise ValueError('Features in the input table(s) not in'
' the features in the ordination.'
Expand Down
400 changes: 200 additions & 200 deletions gemelli/scripts/tests/rpca_data/distance-matrix.tsv

Large diffs are not rendered by default.

402 changes: 201 additions & 201 deletions gemelli/scripts/tests/rpca_data/expected-joint-distance-matrix.tsv

Large diffs are not rendered by default.

Loading

0 comments on commit 8de57ac

Please sign in to comment.