Merge pull request #98 from biocore/bugfixes-jointrpca

Bug fixes for Joint-RPCA & TEMPTED
biocore · Oct 1, 2024 · 8de57ac · 8de57ac
2 parents 0594353 + fd42ad1
commit 8de57ac
Show file tree

Hide file tree

Showing 13 changed files with 12,877 additions and 12,825 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,17 @@
 
+v0.0.12 (2024-10-01)
+
+### Bug fixes
+
+* Joint-RPCA re-centering bug.
+    * see issue #97
+* Updated for the newest version of Pandas.
+    * see issue #96
+* Reduce unecessary numpy warnings regarding log on unseen data.
+    * see issue #78
+* Fix filter and ordering with TEMPTED
+    * see issue #92 and issue #93
+
 v0.0.11 (2024-06-24)
 
 ### Bug fixes

diff --git a/gemelli/__init__.py b/gemelli/__init__.py
@@ -6,4 +6,4 @@
 # The full license is in the file COPYING.txt, distributed with this software.
 # ----------------------------------------------------------------------------
 
-__version__ = "0.0.11"
+__version__ = "0.0.12"
diff --git a/gemelli/optspace.py b/gemelli/optspace.py
@@ -374,10 +374,8 @@ def joint_solve(self, multiple_obs):
             S_shared = np.diag(S_shared)
             S_shared = S_shared / np.linalg.norm(S_shared)
             U_shared = np.average(sample_loadings, axis=0)
-            U_shared -= U_shared.mean(0)
             feature_loadings = [(S_shared).dot(v_i.T).T
                                 for v_i in feature_loadings]
-
         # compensates the smaller average size of
         # observed values vs. missing
         S_shared = S_shared / rescal_param

diff --git a/gemelli/preprocessing.py b/gemelli/preprocessing.py
@@ -420,9 +420,11 @@ def matrix_rclr(mat, branch_lengths=None):
         raise ValueError('Data-matrix contains nans')
     # take the log of the sample centered data
     if branch_lengths is not None:
-        mat = np.log(matrix_closure(matrix_closure(mat) * branch_lengths))
+        with np.errstate(divide='ignore'):
+            mat = np.log(matrix_closure(matrix_closure(mat) * branch_lengths))
     else:
-        mat = np.log(matrix_closure(mat))
+        with np.errstate(divide='ignore'):
+            mat = np.log(matrix_closure(mat))
     # generate a mask of missing values
     mask = [True] * mat.shape[0] * mat.shape[1]
     mask = np.array(mat).reshape(mat.shape)
@@ -586,7 +588,8 @@ def matrix_closure(mat):
     """
 
     mat = np.atleast_2d(mat)
-    mat = mat / mat.sum(axis=1, keepdims=True)
+    with np.errstate(divide='ignore', invalid='ignore'):
+        mat = mat / mat.sum(axis=1, keepdims=True)
 
     return mat.squeeze()
 

diff --git a/gemelli/q2/tests/test_transformer.py b/gemelli/q2/tests/test_transformer.py
@@ -1,7 +1,7 @@
 import pandas as pd
 import qiime2
 import numpy as np
-import pandas.util.testing as pdt
+import pandas.testing as pdt
 from skbio.util import get_data_path
 from qiime2.plugin.testing import TestPluginBase
 from gemelli.q2._format import TrajectoryFormat

diff --git a/gemelli/rpca.py b/gemelli/rpca.py
@@ -596,8 +596,9 @@ def frequency_filter(val, id_, md):
         raise ValueError('Data-table contains duplicate sample IDs')
     if len(table.ids('observation')) != len(set(table.ids('observation'))):
         raise ValueError('Data-table contains duplicate feature IDs')
-    # ensure empty samples / features are removed
-    table = table.remove_empty()
+    if min_sample_count is not None:
+        # ensure empty samples / features are removed
+        table = table.remove_empty(inplace=False)
 
     return table
 
@@ -746,10 +747,22 @@ def joint_rpca(tables: biom.Table,
                                                       pd.DataFrame):
         sample_metadata = sample_metadata.to_dataframe()
     if sample_metadata is None or train_test_column is None:
-        test_samples = sorted(list(shared_all_samples))[:n_test_samples]
+        # choose samples based on RPCA loadings on first table
+        # a somewhat intelligent way of making sure the test
+        # samples are representative of the data.
+        ord_sort, _ = optspace_helper(rclr_tables[0].T.values,
+                                      rclr_tables[0].index,
+                                      rclr_tables[0].columns,
+                                      n_components=n_components)
+        ord_sort = list(ord_sort.samples.iloc[:, 0].sort_values().index)
+        test_samples = np.round(np.linspace(0,
+                                            len(ord_sort) - 1,
+                                            n_test_samples)
+                                ).astype(int)
+        test_samples = [ord_sort[i] for i in test_samples]
         train_samples = list(set(shared_all_samples) - set(test_samples))
     else:
-        sample_metadata = sample_metadata.loc[shared_all_samples, :]
+        sample_metadata = sample_metadata.loc[list(shared_all_samples), :]
         train_samples = sample_metadata[train_test_column] == 'train'
         test_samples = sample_metadata[train_test_column] == 'test'
         train_samples = sample_metadata[train_samples].index
@@ -909,9 +922,10 @@ def transform(ordination: OrdinationResults,
                          ' the features in the ordination.')
     elif subset_tables:
         unshared_N = len(set(rclr_table_df.index)) - len(shared_features)
-        warnings.warn('Removing %i features(s) in table(s)'
-                      ' but not the ordination.'
-                      % (unshared_N), RuntimeWarning)
+        if unshared_N != 0:
+            warnings.warn('Removing %i features(s) in table(s)'
+                          ' but not the ordination.'
+                          % (unshared_N), RuntimeWarning)
     else:
         raise ValueError('Features in the input table(s) not in'
                          ' the features in the ordination.'

diff --git a/gemelli/scripts/tests/rpca_data/distance-matrix.tsv b/gemelli/scripts/tests/rpca_data/distance-matrix.tsv
diff --git a/gemelli/scripts/tests/rpca_data/expected-joint-distance-matrix.tsv b/gemelli/scripts/tests/rpca_data/expected-joint-distance-matrix.tsv