diff --git a/examples/00_quick_start/geoimc_movielens.ipynb b/examples/00_quick_start/geoimc_movielens.ipynb new file mode 100644 index 0000000000..c852973f1e --- /dev/null +++ b/examples/00_quick_start/geoimc_movielens.ipynb @@ -0,0 +1,342 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Geometry Aware Inductive Matrix Completion (GeoIMC)\n", + "\n", + "GeoIMC is an inductive matrix completion algorithm based on the works by Jawanpuria et al. (2019)\n", + "\n", + "Consider the case of MovieLens-100K (ML100K), Let $X \\in R^{m \\times d_1}, Z \\in R^{n \\times d_2} $ be the features of users and movies respectively. Let $M \\in R^{m \\times n}$, be the partially observed ratings matrix. GeoIMC models this matrix as $M = XUBV^TZ^T$, where $U \\in R^{d_1 \\times k}, V \\in R^{d_2 \\times k}, B \\in R^{k \\times k}$ are Orthogonal, Orthogonal, Symmetric Positive-Definite matrices respectively. This Optimization problem is solved by using Pymanopt.\n", + "\n", + "\n", + "This notebook provides an example of how to utilize and evaluate GeoIMC implementation in **reco_utils**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import tempfile\n", + "import zipfile\n", + "import pandas as pd\n", + "import numpy as np\n", + "import papermill as pm\n", + "sys.path.append(\"../../\")\n", + "sys.path.append(\"../../reco_utils/recommender/geoimc/\")\n", + "\n", + "from reco_utils.dataset import movielens\n", + "from reco_utils.recommender.geoimc.geoimc_data import ML_100K\n", + "from reco_utils.recommender.geoimc.geoimc_algorithm import IMCProblem\n", + "from reco_utils.recommender.geoimc.geoimc_predict import Inferer\n", + "from reco_utils.recommender.geoimc.geoimc_utils import download_geoimc_features\n", + "from reco_utils.evaluation.python_evaluation import (\n", + " rmse, mae\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + 
"metadata": {}, + "outputs": [], + "source": [ + "# Choose the MovieLens dataset\n", + "MOVIELENS_DATA_SIZE = '100k'\n", + "# Normalize user, item features\n", + "normalize = True\n", + "# Rank (k) of the model\n", + "rank = 300\n", + "# Regularization parameter\n", + "regularizer = 1e-3\n", + "\n", + "# URL to download geoimc ML100K features\n", + "FEATURES_URL = \"https://recodatasets.blob.core.windows.net/geoimc/ml100k-features\"\n", + "\n", + "# Parameters for algorithm convergence\n", + "max_iters = 150000\n", + "max_time = 1000\n", + "verbosity = 1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Download ML100K dataset and features" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 4.81k/4.81k [00:09<00:00, 519KB/s]\n", + "100%|██████████| 82.0/82.0 [00:01<00:00, 53.7KB/s]\n", + "100%|██████████| 31.0/31.0 [00:00<00:00, 59.7KB/s]\n" + ] + } + ], + "source": [ + "# Create a directory to download ML100K\n", + "dp = tempfile.mkdtemp(suffix='-geoimc')\n", + "movielens.download_movielens(MOVIELENS_DATA_SIZE, f\"{dp}/ml-100k.zip\")\n", + "with zipfile.ZipFile(f\"{dp}/ml-100k.zip\", 'r') as z:\n", + " z.extractall(dp)\n", + "\n", + "download_geoimc_features(FEATURES_URL, [\"item-features.smat\", \"user-features.smat\"], f\"{dp}/ml-100k/\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Load the dataset using the example features provided in helpers\n", + "\n", + "The features were generated using the same method as the work by Xin Dong et al. 
(2017)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "dataset = ML_100K(\n", + " normalize=normalize,\n", + " target_transform='binarize'\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "dataset.load_data(\n", + " f\"{dp}/ml-100k/\", \n", + " f\"{dp}/ml-100k/user-features.smat\",\n", + " f\"{dp}/ml-100k/item-features.smat\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Characteristics:\n", + "\n", + " target: (943, 1682)\n", + " entities: (943, 1822), (1682, 1923)\n", + "\n", + " training: (80000,)\n", + " training_entities: (943, 1822), (1682, 1923)\n", + "\n", + " testing: (20000,)\n", + " test_entities: (943, 1822), (1682, 1923)\n", + "\n" + ] + } + ], + "source": [ + "print(f\"\"\"Characteristics:\n", + "\n", + " target: {dataset.training_data.data.shape}\n", + " entities: {dataset.entities[0].shape}, {dataset.entities[1].shape}\n", + "\n", + " training: {dataset.training_data.get_data().data.shape}\n", + " training_entities: {dataset.training_data.get_entity(\"row\").shape}, {dataset.training_data.get_entity(\"col\").shape}\n", + "\n", + " testing: {dataset.test_data.get_data().data.shape}\n", + " test_entities: {dataset.test_data.get_entity(\"row\").shape}, {dataset.test_data.get_entity(\"col\").shape}\n", + "\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. 
Initialize the IMC problem" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "np.random.seed(10)\n", + "prblm = IMCProblem(\n", + " dataset.training_data,\n", + " lambda1=regularizer,\n", + " rank=rank\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Optimizing...\n", + "Terminated - max time reached after 1844 iterations.\n", + "\n" + ] + } + ], + "source": [ + "# Solve the Optimization problem\n", + "prblm.solve(\n", + " max_time,\n", + " max_iters,\n", + " verbosity\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize an inferer\n", + "inferer = Inferer(\n", + " method='dot'\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# Predict using the parametrized matrices\n", + "predictions = inferer.infer(\n", + " dataset.test_data,\n", + " prblm.W\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# Prepare the test, predicted dataframes\n", + "user_ids = dataset.test_data.get_data().tocoo().row\n", + "item_ids = dataset.test_data.get_data().tocoo().col\n", + "test_df = pd.DataFrame(\n", + " data={\n", + " \"userID\": user_ids,\n", + " \"itemID\": item_ids,\n", + " \"rating\": dataset.test_data.get_data().data\n", + " }\n", + ")\n", + "predictions_df = pd.DataFrame(\n", + " data={\n", + " \"userID\": user_ids,\n", + " \"itemID\": item_ids,\n", + " \"prediction\": [predictions[uid, iid] for uid, iid in list(zip(user_ids, item_ids))]\n", + " }\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RMSE: 0.49632302257817473\n", + "MAE: 
0.47524377750493757\n", + "\n" + ] + } + ], + "source": [ + "# Calculate RMSE\n", + "RMSE = rmse(\n", + " test_df,\n", + " predictions_df\n", + ")\n", + "# Calculate MAE\n", + "MAE = mae(\n", + " test_df,\n", + " predictions_df\n", + ")\n", + "print(f\"\"\"\n", + "RMSE: {RMSE}\n", + "MAE: {MAE}\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pm.record(\"rmse\", RMSE)\n", + "pm.record(\"mae\", MAE)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## References\n", + "\n", + "[1] Pratik Jawanpuria, Arjun Balgovind, Anoop Kunchukuttan, Bamdev Mishra. _[Learning Multilingual Word Embeddings in Latent Metric Space: A Geometric Approach](https://www.mitpressjournals.org/doi/full/10.1162/tacl_a_00257)_. Transactions of the Association for Computational Linguistics (TACL), Volume 7, p.107-120, 2019.\n", + "\n", + "[2] Xin Dong, Lei Yu, Zhonghuo Wu, Yuxia Sun, Lingfeng Yuan, Fangxi Zhang. [A Hybrid Collaborative Filtering Model with Deep Structure for Recommender Systems](https://aaai.org/ocs/index.php/AAAI/AAAI17/paper/view/14676/13916).\n", + "Proceedings of the Thirty-First AAAI Conference on Artificial Intelligence (AAAI-17), p.1309-1315, 2017." 
def binarize(a, threshold):
    """Map every value to 1.0 or 0.0 by comparing it against a threshold.

    Args:
        a (np.ndarray): Input array that needs to be binarized.
        threshold (float): Cut-off value. Entries strictly greater than it
            become 1.0; all others (including entries equal to it) become 0.0.

    Returns:
        np.ndarray: Float array shaped like `a` that holds only 0.0 and 1.0.
    """
    # Boolean mask of the entries that clear the cut-off.
    above_threshold = a > threshold
    return np.where(above_threshold, 1.0, 0.0)
class IMCProblem(object):
    """
    Implements the IMC problem.

    The model is parametrized by three matrices (U, B, V) living on the
    product manifold Stiefel x PositiveDefinite x Stiefel built in `__init__`.
    `solve` runs a pymanopt conjugate-gradient solver and stores the result
    in `self.W` as the list `[U, B, V]`.
    """

    def __init__(
            self,
            dataPtr,
            lambda1=1e-2,
            rank=10
    ):
        """ Initialize parameters

        Args:
            dataPtr (DataPtr): An object of which contains X, Z side features and target matrix Y.
            lambda1 (float): Regularizer weight applied to the squared entries of B.
            rank (uint): rank of the U, B, V parametrization.
        """

        self.dataset = dataPtr
        # Row (X) and column (Z) side features.
        self.X = self.dataset.get_entity("row")
        self.Z = self.dataset.get_entity("col")
        self.rank = rank
        # Populates self.Y (must run before nSamples is computed below).
        self._loadTarget()
        # Shape of the target matrix implied by the feature matrices.
        self.shape = (self.X.shape[0], self.Z.shape[0])
        self.lambda1 = lambda1
        # Number of stored entries in the sparse target matrix.
        self.nSamples = self.Y.data.shape[0]

        # Solution [U, B, V]; None until `solve` has run.
        self.W = None
        self.optima_reached = False
        # Search space: U on Stiefel(d1, rank), B on PositiveDefinite(rank),
        # V on Stiefel(d2, rank), where d1/d2 are the feature dimensions.
        self.manifold = Product([
            Stiefel(
                self.X.shape[1],
                self.rank
            ),
            PositiveDefinite(
                self.rank
            ),
            Stiefel(
                self.Z.shape[1],
                self.rank
            )
        ])


    def _loadTarget(self, ):
        """Loads target matrix from the dataset pointer into `self.Y`.
        """
        self.Y = self.dataset.get_data()


    @staticmethod
    @njit(nogil=True, parallel=True)
    def _computeLoss_csrmatrix(a, b, cd, indices, indptr, residual_global):
        """computes residual_global = a*b - cd at given indices in csr_matrix format

        For every row `i` and every stored position `j` of that row
        (`indptr[i] <= j < indptr[i + 1]`), the dot product of `a[i, :]` with
        column `indices[j]` of `b` is computed and `cd[j]` is subtracted.
        The result is written in place to `residual_global[j]`, and the same
        buffer is returned.
        """
        N = a.shape[0]
        M = a.shape[1]
        for i in prange(N):
            for j in prange(indptr[i], indptr[i + 1]):
                num = 0.0
                for k in range(M):
                    num += a[i, k] * b[k, indices[j]]
                residual_global[j] = num - cd[j]
        return residual_global


    def _cost(self, params, residual_global):
        """Compute the cost of GeoIMC optimization problem

        Side effect: `residual_global` is overwritten with the residuals at
        `params`; `_egrad` later reads that buffer.

        Args:
            params (Iterator): An iterator containing the manifold point at which
            the cost needs to be evaluated.
            residual_global (np.ndarray): 1-D buffer with one slot per stored
            entry of Y; filled in place.

        Returns:
            float: Mean squared residual (halved) plus the regularizer on B.
        """
        U = params[0]
        B = params[1]
        V = params[2]

        regularizer = 0.5 * self.lambda1 * np.sum(B**2)

        # Residuals of (X U B)(V^T Z^T) against Y, only at Y's stored positions.
        IMCProblem._computeLoss_csrmatrix(
            self.X.dot(U.dot(B)),
            V.T.dot(self.Z.T),
            self.Y.data,
            self.Y.indices,
            self.Y.indptr,
            residual_global
        )
        cost = 0.5 * np.sum((residual_global)**2)/self.nSamples + regularizer

        return cost


    def _egrad(self, params, residual_global):
        """Computes the euclidean gradient

        NOTE: the residuals are NOT recomputed here. The method uses whatever
        `residual_global` currently holds, so it is only correct when `_cost`
        was last evaluated at the same `params`.

        Args:
            params (Iterator): An iterator containing the manifold point at which
            the cost needs to be evaluated.
            residual_global (np.ndarray): 1-D residual buffer previously filled
            by `_cost`.

        Returns:
            list: `[gradU, gradB_sym, gradV]`, the gradients w.r.t. U, B, V.
        """
        U = params[0]
        B = params[1]
        V = params[2]

        # Re-wrap the flat residuals using Y's sparsity pattern.
        residual_global_csr = csr_matrix(
            (residual_global, self.Y.indices, self.Y.indptr),
            shape=self.shape,
        )

        gradU = np.dot(
            self.X.T,
            residual_global_csr.dot(self.Z.dot(V.dot(B.T)))
        )/self.nSamples

        gradB = np.dot(
            (self.X.dot(U)).T,
            residual_global_csr.dot(self.Z.dot(V))
        )/self.nSamples + self.lambda1 * B
        # Symmetrize the B gradient (B lives on the PositiveDefinite manifold).
        gradB_sym = (gradB + gradB.T)/2

        gradV = np.dot(
            (self.X.dot(U.dot(B))).T,
            residual_global_csr.dot(self.Z)
        ).T/self.nSamples

        return [
            gradU,
            gradB_sym,
            gradV
        ]


    def solve(self, *args):
        """ Main solver of the IMC model

        Does nothing if a previous call already finished (`optima_reached` is
        True); call `reset` first to solve again. The positional arguments
        are forwarded unchanged to `_optimize`.

        Args:
            max_opt_time (uint): Maximum time (in secs) for optimization
            max_opt_iter (uint): Maximum iterations for optimization
            verbosity (uint): The level of verbosity for Pymanopt logs
        """
        if self.optima_reached:
            return

        self._optimize(*args)

        self.optima_reached = True
        return


    def _optimize(self, max_opt_time, max_opt_iter, verbosity):
        """Optimize the GeoIMC optimization problem

        Args: The args of `solve`

        Returns:
            float: Cost evaluated at the solution stored in `self.W`.
        """
        # Single residual buffer shared by the cost and gradient callbacks.
        residual_global = np.zeros(self.Y.data.shape)

        solver = ConjugateGradient(maxtime=max_opt_time, maxiter=max_opt_iter, linesearch=LineSearchBackTracking())
        prb = Problem(
            manifold=self.manifold,
            cost=lambda x: self._cost(
                x,
                residual_global
            ),
            egrad=lambda z: self._egrad(
                z,
                residual_global
            ),
            verbosity=verbosity
        )
        # self.W is passed as the starting point; it is None unless a previous
        # solution is still stored (presumably pymanopt then picks its own
        # initial point - confirm against the pymanopt version in use).
        solution = solver.solve(prb, x=self.W)
        self.W = [solution[0], solution[1], solution[2]]

        return self._cost(self.W, residual_global)


    def reset(self):
        """Reset the model.

        Clears the stored solution and the `optima_reached` flag so that
        `solve` will run the optimization again.
        """
        self.optima_reached = False
        self.W = None
        return
class DataPtr():
    """
    Holds data and its respective indices
    """

    def __init__(self, data, entities):
        """Initialize a data pointer

        Args:
            data (csr_matrix): The target data matrix.
            entities (Iterator): An iterator (of 2 elements (ndarray)) containing
            the features of row, col entities.

        Raises:
            AssertionError: If `data` is not a CSR sparse matrix.
        """
        assert isspmatrix_csr(data), "data must be a scipy.sparse CSR matrix"

        self.data = data
        self.entities = entities
        # Optional row filter applied by `get_data`; None means "all rows".
        self.data_indices = None
        # Optional filters for [row entities, col entities]; None means "all".
        self.entity_indices = [None, None]


    def get_data(self):
        """
        Returns:
            csr_matrix: Target matrix (based on the data_indices filter)
        """
        if self.data_indices is None:
            return self.data
        return self.data[self.data_indices]


    def get_entity(self, of="row"):
        """ Get entity

        Args:
            of (str): The entity, either 'row' or 'col'. Any value other
            than 'row' selects the column entity.
        Returns:
            ndarray: Entity matrix (based on the entity_indices filter)
        """
        idx = 0 if of == "row" else 1
        selection = self.entity_indices[idx]
        if selection is None:
            return self.entities[idx]
        return self.entities[idx][selection]


class Dataset():
    """
    Base class that holds necessary (minimal) information needed
    """

    def __init__(
            self,
            name,
            features_dim=0,
            normalize=False,
            target_transform=''
    ):
        """Initialize parameters

        Args:
            name (str): Name of the dataset
            features_dim (uint): Dimension of the features. If not 0, PCA is performed
            on the features as the dimensionality reduction technique
            normalize (bool): Normalize the features
            target_transform (str): Transform the target values. Current options are
            'normalize' (Normalize the values), '' (Do nothing), 'binarize' (convert
            the values using a threshold defined per dataset)

        """
        # Keep the supplied name (it was previously discarded and set to None).
        self.name = name
        self.training_data = None
        self.test_data = None
        self.entities = None

        self.features_dim = features_dim
        self.feat_normalize = normalize
        self.target_transform = target_transform


    def normalize(self):
        """Normalizes the entity features

        Does nothing unless the dataset was created with `normalize=True`.
        CSR entities go through sklearn's `normalize`; everything else is
        length-normalized row by row.
        """
        if not self.feat_normalize:
            return
        for i, entity in enumerate(self.entities):
            if isspmatrix_csr(entity):
                logger.info("Normalizing CSR matrix")
                # Inside a method body, `normalize` resolves to the
                # module-level sklearn function, not to this method.
                self.entities[i] = normalize(entity)
            else:
                self.entities[i] = length_normalize(entity)


    def generate_train_test_data(self, data, test_ratio=0.3):
        """Generate train, test split. The split is performed on the row
        entities. So, this essentially becomes a cold start row entity test.

        Args:
            data (csr_matrix): The entire target matrix.
            test_ratio (float): Ratio of test split.

        """
        self.training_data = DataPtr(data, self.entities)
        self.test_data = DataPtr(data, self.entities)

        # Fixed random_state keeps the split reproducible.
        train_rows, test_rows = train_test_split(
            np.arange(data.shape[0]),
            test_size=test_ratio,
            shuffle=True,
            random_state=0
        )
        self.training_data.data_indices = train_rows
        self.test_data.data_indices = test_rows
        # Row entities are filtered with the same indices as the target rows.
        self.training_data.entity_indices[0] = train_rows
        self.test_data.entity_indices[0] = test_rows


    def reduce_dims(self):
        """Reduces the dimensionality of entity features.

        Does nothing when `features_dim` is 0.
        """
        if self.features_dim != 0:
            # `reduce_dims` here is the module-level helper, not this method.
            self.entities[0] = reduce_dims(self.entities[0], self.features_dim)
            self.entities[1] = reduce_dims(self.entities[1], self.features_dim)
            logger.info("Dimensionality reduced ...")


class ML_100K(Dataset):
    """
    Handles MovieLens-100K
    """

    # TODO: Get these from `u.info`
    N_USERS = 943
    N_ITEMS = 1682
    # Ratings strictly above this value become 1 under 'binarize'.
    BINARIZE_THRESHOLD = 3

    def __init__(self, **kwargs):
        super().__init__(self.__class__.__name__, **kwargs)
        self.min_rating = 1
        self.max_rating = 5


    def df2coo(self, df):
        """Convert the input dataframe into a coo matrix

        Args:
            df (pd.DataFrame): DataFrame containing the target matrix information.
            Must have the columns 'user id', 'item id' (both 1-based) and 'rating'.

        Returns:
            coo_matrix: Ratings matrix of shape (N_USERS, N_ITEMS), transformed
            according to `self.target_transform`.
        """
        # Ids in the file are 1-based; matrix indices are 0-based.
        row = df['user id'].values - 1
        col = df['item id'].values - 1
        # Pull the whole column at once instead of one `.iloc` call per row.
        data = df['rating'].values

        if self.target_transform == 'normalize':
            # Scale by the L2 norm of the vector of all possible rating values.
            data = data/np.sqrt(np.sum(np.arange(self.min_rating, self.max_rating+1)**2))
        elif self.target_transform == 'binarize':
            data = binarize(np.asarray(data), self.BINARIZE_THRESHOLD)

        return coo_matrix((data, (row, col)), shape=(self.N_USERS, self.N_ITEMS))


    def _read_from_file(self, path):
        """Read the target matrix from file at path.

        Args:
            path (str): Path to the target matrix

        Returns:
            coo_matrix: The ratings in `path`, converted by `df2coo`.
        """
        df = pd.read_csv(path, delimiter='\t', names=['user id','item id','rating','timestamp'], encoding="ISO-8859-1")
        df = df.drop(['timestamp'], axis=1)
        return self.df2coo(df)


    def load_data(self, path, e1_path, e2_path):
        """ Load dataset

        Loads the entity features, applies the configured normalization and
        dimensionality reduction, then reads the `u1.base` / `u1.test` split.

        Args:
            path (str): Path to the directory containing ML100K dataset
            e1_path (str): Path to the file containing row (user) features of ML100K dataset
            e2_path (str): Path to the file containing col (movie) features of ML100K dataset
        """
        self.entities = [self._load_features(e1_path, "userFeatures"), self._load_features(e2_path, "itemFeatures")]
        self.normalize()
        self.reduce_dims()
        self.training_data = DataPtr(self._read_from_file(f"{path}/u1.base").tocsr(), self.entities)
        self.test_data = DataPtr(self._read_from_file(f"{path}/u1.test").tocsr(), self.entities)


    def _load_features(self, path, key):
        """Load entity features

        Args:
            path (str): Path to the file containing features. It should be a matlab file.
            key (str): key containing the feature information.

        Returns:
            ndarray: Dense copy of the matrix stored under `key`.
        """
        data = loadmat(path)
        return data[key].toarray()
class PlainScalarProduct(object):
    """
    Module that implements plain scalar product
    as the retrieval criterion
    """

    def __init__(
            self,
            X,
            Y,
            **kwargs
    ):
        """
        Args:
            X: numpy matrix of shape (users, features)
            Y: numpy matrix of shape (items, features)
        """
        self.X = X
        self.Y = Y


    def sim(self, **kwargs):
        """Calculate the similarity score

        Returns:
            The product `X . Y^T`, i.e. one score per (user, item) pair.
        """
        sim = self.X.dot(self.Y.T)
        return sim


class Inferer():
    """
    Holds necessary (minimal) information needed for inference
    """

    def __init__(
            self,
            method='dot',
            k=10,
            transformation=''
    ):
        """Initialize parameters

        Args:
            method (str): The inference method. Currently 'dot'
                (Dot product) is supported.
            k (uint): `k` for 'topk' transformation.
            transformation (str): Transform the inferred values into a
                different scale. Currently 'mean' (Binarize the values
                using mean of inferred matrix as the threshold), 'topk'
                (Pick Top-K inferred values per row and assign them 1,
                setting rest of them to 0), '' (No transformation) are
                supported.

        Raises:
            ValueError: If `method` is not a known inference method.
        """
        self.method = self._get_method(method)
        self.k = k
        self.transformation = transformation


    def _get_method(self, k):
        """Get the inferer method

        Args:
            k (str): The inferer name

        Returns:
            class: A class object implementing the inferer 'k'

        Raises:
            ValueError: If 'k' does not name a supported inferer.
        """
        if k == 'dot':
            method = PlainScalarProduct
        else:
            raise ValueError(f"{k} is unknown.")
        return method


    def infer(self, dataPtr, W, **kwargs):
        """Main inference method

        Args:
            dataPtr (DataPtr): An object containing the X, Z features needed for inference.
                A two-element list `[a, b]` of already-projected row / col
                embeddings is also accepted; `W` is then ignored.
            W (iterable): An iterable containing the U, B, V parametrized matrices.

        Returns:
            Score matrix with one row per row entity and one column per col
            entity: raw scores when no transformation is configured, otherwise
            a matrix of zeros and ones. An unrecognized transformation name
            yields the raw scores.
        """

        if isinstance(dataPtr, list):
            a = dataPtr[0]
            b = dataPtr[1]
        else:
            # The matrix square root is the expensive step; compute it once
            # and reuse it for both the row and the column projection.
            sqrt_b = sqrtm(W[1])
            a = dataPtr.get_entity("row").dot(W[0]).dot(sqrt_b)
            b = dataPtr.get_entity("col").dot(W[2]).dot(sqrt_b)

        sim_score = self.method(
            a,
            b
        ).sim(**kwargs)

        if self.transformation == 'mean':
            prediction = conv_binary(sim_score, sim_score.mean())
        elif self.transformation == 'topk':
            prediction = self._top_k_mask(sim_score)
        else:
            prediction = sim_score

        return prediction


    def _top_k_mask(self, sim_score):
        """Return a 0/1 matrix marking the `k` largest scores of every row.

        `k` is clamped to `[0, n_columns]`, so an oversized `k` marks the
        whole row instead of raising, and `k == 0` marks nothing.
        """
        n_cols = sim_score.shape[1]
        top_k = max(0, min(self.k, n_cols))

        # Same dtype as the scores, matching an in-place overwrite of a copy.
        mask = np.zeros_like(sim_score)
        if top_k == 0:
            return mask

        # argpartition puts the indices of the k largest entries of each row
        # into the last k positions; their internal order is irrelevant here.
        top_idx = np.argpartition(sim_score, -top_k, axis=1)[:, -top_k:]
        row_idx = np.arange(sim_score.shape[0])[:, np.newaxis]
        mask[row_idx, top_idx] = 1
        return mask
def download_geoimc_features(remote_base_url, remote_filenames, dest):
    """Fetch the GeoIMC feature files into a local directory.

    Every file name is appended to `remote_base_url` to form the source URL
    and to `dest` to form the local target path; the pair is then handed to
    `maybe_download`.

    Args:
        remote_base_url (url): Base URL at which features are present.
        remote_filenames (Iterator): An iterator (of 2 elements, in general) containing
            the filenames of row, col features at the remote_base_url.
        dest (str): The destination of these downloaded files (Destination dir should already be
            created).
    """
    for feature_file in remote_filenames:
        source_url = f"{remote_base_url}/{feature_file}"
        target_path = f"{dest}/{feature_file}"
        maybe_download(source_url, target_path)
pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"] + + for key, value in expected_values.items(): + assert results[key] == pytest.approx(value, rel=TOL, abs=ABS_TOL) diff --git a/tests/unit/test_geoimc.py b/tests/unit/test_geoimc.py new file mode 100644 index 0000000000..6b7f02cdf2 --- /dev/null +++ b/tests/unit/test_geoimc.py @@ -0,0 +1,147 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +import itertools +import collections +import pytest +import numpy as np +import pandas as pd +from scipy.sparse import csr_matrix +from pandas.util.testing import assert_frame_equal + +from reco_utils.common.python_utils import binarize +from reco_utils.recommender.geoimc.geoimc_data import DataPtr +from reco_utils.recommender.geoimc.geoimc_predict import PlainScalarProduct, Inferer +from reco_utils.recommender.geoimc.geoimc_algorithm import IMCProblem +from reco_utils.recommender.geoimc.geoimc_utils import length_normalize, mean_center, reduce_dims +from pymanopt.manifolds import Stiefel, PositiveDefinite + +_IMC_TEST_DATA = [ +( + csr_matrix( + np.array([[1, 5, 3], [7, 2, 1]]) + ), + [np.array([[0, 6, 0, 5], [7, 1, 2, 1]]), + np.array([[8, 8, 0, 8, 4], [7, 4, 3, 0, 7], [0, 6, 8, 7, 2]]) + ] +), +( + csr_matrix( + np.array([[8, 6, 0], [6, 6, 2]]) + ), + [np.array([[3, 7, 5, 7, 6], [6, 5, 6, 8, 1]]), + np.array([[5, 3, 2, 7, 6], [2, 6, 9, 3, 3], [9, 9, 4, 9, 2]]) + ] +)] + +# `geoimc_data` tests +@pytest.mark.parametrize( + "data, entities", + _IMC_TEST_DATA +) +def test_dataptr(data, entities): + ptr = DataPtr(data, entities) + assert (ptr.get_data() != data).nnz == 0 + assert np.array_equal(ptr.get_entity("row"), entities[0]) + assert np.array_equal(ptr.get_entity("col"), entities[1]) + + +# `geoimc_utils` tests +@pytest.mark.parametrize( + "matrix", + [ + (np.array([[3, 5, 6], [2, 7, 0],[0, 5, 2]])), + (np.array([[7, 9, 9], [4, 3, 8],[6, 0, 3]])) + ] +) +def test_length_normalize(matrix): + assert 
@pytest.mark.parametrize(
    "dataPtr",
    [
        DataPtr(_IMC_TEST_DATA[0][0], _IMC_TEST_DATA[0][1]),
        DataPtr(_IMC_TEST_DATA[1][0], _IMC_TEST_DATA[1][1]),
    ]
)
def test_inferer_infer(dataPtr):
    """Check the output of `Inferer.infer` for every transformation."""
    test_data = dataPtr

    n_rows, rowFeatureDim = test_data.get_entity("row").shape
    n_cols, colFeatureDim = test_data.get_entity("col").shape
    rank = 2
    # Random point on the manifolds used by the model: [U, B, V].
    W = [Stiefel(rowFeatureDim, rank).rand(), PositiveDefinite(rank).rand(), Stiefel(colFeatureDim, rank).rand()]

    # No transformation: one raw score per (row entity, col entity) pair.
    inference = Inferer(method='dot').infer(test_data, W)
    assert inference.shape == (n_rows, n_cols)

    # 'mean' must produce only zeros and ones.
    inference = Inferer(method='dot', transformation='mean').infer(test_data, W)
    nOccurences = collections.Counter(inference.ravel())
    assert nOccurences[0]+nOccurences[1] == inference.size

    # 'topk' must produce only zeros and ones ...
    k = 2
    inference = Inferer(method='dot', k=k, transformation='topk').infer(test_data, W)
    nOccurences = collections.Counter(inference.ravel())
    assert nOccurences[0]+nOccurences[1] == inference.size
    # ... with exactly k ones in every ROW (top-k is selected per row, so the
    # count has to run along axis=1; every fixture has at least k columns).
    assert np.all(np.count_nonzero(inference == 1, axis=1) == k)