########################################### # Suppress matplotlib user warnings # Necessary for newer version of matplotlib import warnings warnings.filterwarnings("ignore", category = UserWarning, module = "matplotlib") # # Display inline matplotlib plots with IPython from IPython import get_ipython get_ipython().run_line_magic('matplotlib', 'inline') ########################################### import matplotlib.pyplot as plt import matplotlib.cm as cm import pandas as pd import numpy as np def pca_results(good_data, pca): ''' Create a DataFrame of the PCA results Includes dimension feature weights and explained variance Visualizes the PCA results ''' # Dimension indexing dimensions = dimensions = ['Dimension {}'.format(i) for i in range(1,len(pca.components_)+1)] # PCA components components = pd.DataFrame(np.round(pca.components_, 4), columns = good_data.keys()) components.index = dimensions # PCA explained variance ratios = pca.explained_variance_ratio_.reshape(len(pca.components_), 1) variance_ratios = pd.DataFrame(np.round(ratios, 4), columns = ['Explained Variance']) variance_ratios.index = dimensions # Create a bar plot visualization fig, ax = plt.subplots(figsize = (14,8)) # Plot the feature weights as a function of the components components.plot(ax = ax, kind = 'bar'); ax.set_ylabel("Feature Weights") ax.set_xticklabels(dimensions, rotation=0) # Display the explained variance ratios for i, ev in enumerate(pca.explained_variance_ratio_): ax.text(i-0.40, ax.get_ylim()[1] + 0.05, "Explained Variance\n %.4f"%(ev)) # Return a concatenated DataFrame return pd.concat([variance_ratios, components], axis = 1) def cluster_results(reduced_data, preds, centers, pca_samples): ''' Visualizes the PCA-reduced cluster data in two dimensions Adds cues for cluster centers and student-selected sample data ''' predictions = pd.DataFrame(preds, columns = ['Cluster']) plot_data = pd.concat([predictions, reduced_data], axis = 1) # Generate the cluster plot fig, ax = plt.subplots(figsize = (14,8)) # Color map cmap = cm.get_cmap('gist_rainbow') # Color the points based on assigned cluster for i, cluster in plot_data.groupby('Cluster'): cluster.plot(ax = ax, kind = 'scatter', x = 'Dimension 1', y = 'Dimension 2', \ color = cmap((i)*1.0/(len(centers)-1)), label = 'Cluster %i'%(i), s=30); # Plot centers with indicators for i, c in enumerate(centers): ax.scatter(x = c[0], y = c[1], color = 'white', edgecolors = 'black', \ alpha = 1, linewidth = 2, marker = 'o', s=200); ax.scatter(x = c[0], y = c[1], marker='$%d$'%(i), alpha = 1, s=100); # Plot transformed sample points ax.scatter(x = pca_samples[:,0], y = pca_samples[:,1], \ s = 150, linewidth = 4, color = 'black', marker = 'x'); # Set plot title ax.set_title("Cluster Learning on PCA-Reduced Data - Centroids Marked by Number\nTransformed Sample Data Marked by Black Cross"); def biplot(good_data, reduced_data, pca): ''' Produce a biplot that shows a scatterplot of the reduced data and the projections of the original features. good_data: original data, before transformation. Needs to be a pandas dataframe with valid column names reduced_data: the reduced data (the first two dimensions are plotted) pca: pca object that contains the components_ attribute return: a matplotlib AxesSubplot object (for any additional customization) This procedure is inspired by the script: https://github.com/teddyroland/python-biplot ''' fig, ax = plt.subplots(figsize = (14,8)) # scatterplot of the reduced data ax.scatter(x=reduced_data.loc[:, 'Dimension 1'], y=reduced_data.loc[:, 'Dimension 2'], facecolors='b', edgecolors='b', s=70, alpha=0.5) feature_vectors = pca.components_.T # we use scaling factors to make the arrows easier to see arrow_size, text_pos = 7.0, 8.0, # projections of the original features for i, v in enumerate(feature_vectors): ax.arrow(0, 0, arrow_size*v[0], arrow_size*v[1], head_width=0.2, head_length=0.2, linewidth=2, color='red') ax.text(v[0]*text_pos, v[1]*text_pos, good_data.columns[i], color='black', ha='center', va='center', fontsize=18) ax.set_xlabel("Dimension 1", fontsize=14) ax.set_ylabel("Dimension 2", fontsize=14) ax.set_title("PC plane with original feature projections.", fontsize=16); return ax def channel_results(reduced_data, outliers, pca_samples): ''' Visualizes the PCA-reduced cluster data in two dimensions using the full dataset Data is labeled by "Channel" and cues added for student-selected sample data ''' # Check that the dataset is loadable try: full_data = pd.read_csv("customers.csv") except: print("Dataset could not be loaded. Is the file missing?") return False # Create the Channel DataFrame channel = pd.DataFrame(full_data['Channel'], columns = ['Channel']) channel = channel.drop(channel.index[outliers]).reset_index(drop = True) labeled = pd.concat([reduced_data, channel], axis = 1) # Generate the cluster plot fig, ax = plt.subplots(figsize = (14,8)) # Color map cmap = cm.get_cmap('gist_rainbow') # Color the points based on assigned Channel labels = ['Hotel/Restaurant/Cafe', 'Retailer'] grouped = labeled.groupby('Channel') for i, channel in grouped: channel.plot(ax = ax, kind = 'scatter', x = 'Dimension 1', y = 'Dimension 2', \ color = cmap((i-1)*1.0/2), label = labels[i-1], s=30); # Plot transformed sample points for i, sample in enumerate(pca_samples): ax.scatter(x = sample[0], y = sample[1], \ s = 200, linewidth = 3, color = 'black', marker = 'o', facecolors = 'none'); ax.scatter(x = sample[0]+0.25, y = sample[1]+0.3, marker='$%d$'%(i), alpha = 1, s=125); # Set plot title ax.set_title("PCA-Reduced Data Labeled by 'Channel'\nTransformed Sample Data Circled");