!pip install tensorflow
!pip install fancyimpute
import tensorflow as tf # fancyimpute uses tensorflow, we’ll explicitly load it so that’s clear
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import sklearn as sk
from fancyimpute import SimpleFill, KNN, MatrixFactorization
%matplotlib inline
plt.style.use(“ggplot”)
tf.random.set_seed(0)
np.random.seed(0)
# Build a synthetic low-rank data matrix X and hide part of it.
# n is the number of users (rows of X).
n = 100
# m is the number of movies or products (columns of X).
m = 50
# inner_rank is the number of real genres or categories that movies or
# products belong to; it is the shared inner dimension of the two factors.
inner_rank = 10
# Draw the user factors first and the item factors second so the sequence of
# random numbers consumed stays the same.
user_matrix = np.random.standard_normal(size=(n, inner_rank))
item_matrix = np.random.standard_normal(size=(inner_rank, m))
X = np.matmul(user_matrix, item_matrix)
# Fraction of entries that remain observed; an entry is hidden when its
# uniform draw falls below 1 - visible_percentage.
visible_percentage = 0.6
missing_mask = np.random.random_sample(size=X.shape) < (1 - visible_percentage)
# Missing entries are indicated with NaN; observed entries keep their value.
X_incomplete = np.where(missing_mask, np.nan, X)
# Baseline completion: fill the NaN entries of X_incomplete with the "mean"
# strategy of SimpleFill.
meanFill = SimpleFill("mean")
X_filled_mean = meanFill.fit_transform(X_incomplete)

# Show the original matrix and the mean-filled matrix side by side.
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 6))
for _axis, _matrix, _title in (
    (ax1, X, "Original Matrix"),
    (ax2, X_filled_mean, "Mean Fill Completed Matrix"),
):
    _axis.imshow(_matrix)
    _axis.set_title(_title)
    # Grid lines are switched off so they do not overlay the image.
    _axis.grid(False)
plt.show()
# To test the performance of our matrix completion algorithm we want to compare
# the "filled-in" values to the original:
def mat_completion_mse(X_filled, X_truth, missing_mask):
    """Calculate the mean squared error of the filled-in values vs. the truth.

    Only the entries selected by ``missing_mask`` are compared; observed
    entries do not contribute to the score.

    Args:
        X_filled (np.ndarray): The "filled-in" matrix from a matrix completion
            algorithm.
        X_truth (np.ndarray): The true, fully observed matrix.
        missing_mask (np.ndarray): Boolean array marking the entries that were
            missing (True) and therefore had to be filled in.

    Returns:
        float: Mean squared error of the filled values. NumPy returns NaN
        (with a RuntimeWarning) when the mask selects no entries.
    """
    # Coerce to arrays so nested lists work as well as ndarrays.
    mask = np.asarray(missing_mask, dtype=bool)
    residuals = np.asarray(X_filled)[mask] - np.asarray(X_truth)[mask]
    return (residuals ** 2).mean()
# Score the mean-fill baseline on the entries that were hidden from it.
meanFill_mse = mat_completion_mse(
    X_filled=X_filled_mean,
    X_truth=X,
    missing_mask=missing_mask,
)
# Find the best value for k
def find_best_k(k_neighbors, complete_mat, incomplete_mat, missing_mask):
    """Determine the best k to use for matrix completion with KNN.

    Each candidate k is used to complete ``incomplete_mat`` with KNN, and the
    result is scored with ``mat_completion_mse`` against ``complete_mat`` on
    the entries flagged in ``missing_mask``. The k with the lowest error wins.

    Args:
        k_neighbors (iterable): The list of k's to try.
        complete_mat (np.ndarray): The original matrix with complete values.
        incomplete_mat (np.ndarray): The matrix with missing values (NaN).
        missing_mask (np.ndarray): Boolean array of missing values.

    Returns:
        integer: The best value of k to use for that particular matrix, or -1
        when ``k_neighbors`` is empty.
    """
    best_k = -1
    # np.inf rather than np.infty: the latter alias was removed in NumPy 2.0.
    best_k_mse = np.inf
    for neighbors in k_neighbors:
        # verbose=False keeps the imputer's progress output out of the search.
        filled = KNN(k=neighbors, verbose=False).fit_transform(incomplete_mat)
        mse = mat_completion_mse(filled, complete_mat, missing_mask)
        # Strict comparison: on a tie the earliest candidate is kept.
        if mse < best_k_mse:
            best_k = neighbors
            best_k_mse = mse
    return best_k
# Candidate neighbourhood sizes to evaluate.
k_neighbors = [2, 3, 4, 5, 10, 20]
best_k = find_best_k(k_neighbors, X, X_incomplete, missing_mask)
# Run KNN with the best_k and store the result in X_filled_knn
X_filled_knn = KNN(k=best_k, verbose=False).fit_transform(X_incomplete)
# Score the KNN completion on the entries that were hidden from it.
knnFill_mse = mat_completion_mse(X_filled_knn, X, missing_mask)
print("knnFill MSE: %f" % knnFill_mse)