{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Gaussian samples in high dimensions\n",
    "\n",
    "This notebook draws samples from unit-variance Gaussians and plots how their norms and pairwise Euclidean distances are distributed as the number of dimensions grows."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "# The name Axes3D is unused; the import is kept because older Matplotlib\n",
    "# releases need it to register the '3d' projection.\n",
    "from mpl_toolkits.mplot3d import Axes3D\n",
    "from scipy.spatial.distance import pdist, squareform\n",
    "import seaborn as sns\n",
    "\n",
    "sns.set()\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Seed the global NumPy generator so that Restart & Run All reproduces the same figures.\n",
    "SEED = 0\n",
    "np.random.seed(SEED)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Helper functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def norms(x):\n",
    "    \"\"\"\n",
    "    Return the Euclidean norm of every row of x.\n",
    "\n",
    "    :param x: 2-D array with one sample per row.\n",
    "    :return: 1-D array holding one norm per row.\n",
    "    \"\"\"\n",
    "    return np.linalg.norm(x, axis=1)\n",
    "\n",
    "\n",
    "def gen_gaussian_clusters(mu0, dim, n_samples):\n",
    "    \"\"\"\n",
    "    Generate three Gaussian clusters centered at:\n",
    "        (mu0[0], 0, 0, ...)\n",
    "        (mu0[1], 0, 0, ...)\n",
    "        (mu0[2], 0, 0, ...)\n",
    "\n",
    "    :param mu0: Determines the center of the clusters (first coordinate of each center).\n",
    "    :param dim: Dimension of the Gaussian samples (must be at least 1).\n",
    "    :param n_samples: Number of samples to generate for each cluster.\n",
    "    :return: Tuple (x1, x2, x3) of arrays, each of shape (n_samples, dim).\n",
    "    \"\"\"\n",
    "    centers = np.zeros((3, dim))\n",
    "    centers[:, 0] = mu0  # the clusters differ only along the first axis\n",
    "    # Samples are drawn cluster by cluster, in order x1, x2, x3.\n",
    "    return tuple(np.random.randn(n_samples, dim) + center for center in centers)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Standard Gaussian samples in 2-D and 3-D"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "w2d = np.random.randn(500, 2)\n",
    "fig, ax = plt.subplots(figsize=(6, 6))\n",
    "ax.scatter(w2d[:, 0], w2d[:, 1])\n",
    "ax.axis('equal')\n",
    "ax.set_title('500 standard Gaussian samples (2-D)')\n",
    "plt.show()\n",
    "\n",
    "w3d = np.random.randn(500, 3)\n",
    "fig = plt.figure(figsize=(6, 6))\n",
    "ax = fig.add_subplot(111, projection='3d')\n",
    "ax.scatter(w3d[:, 0], w3d[:, 1], w3d[:, 2])\n",
    "# ax.axis('equal') raises NotImplementedError on 3-D axes in some Matplotlib\n",
    "# releases, so give all three axes the same symmetric limits instead.\n",
    "lim = np.abs(w3d).max()\n",
    "ax.set_xlim(-lim, lim)\n",
    "ax.set_ylim(-lim, lim)\n",
    "ax.set_zlim(-lim, lim)\n",
    "ax.set_title('500 standard Gaussian samples (3-D)')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Distribution of the norm for increasing dimension"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "norm_dims = [1, 3, 5, 10, 30, 100]\n",
    "\n",
    "# Distribution of ||x||\n",
    "# NOTE: sns.distplot is deprecated in seaborn >= 0.11; it is kept so the\n",
    "# notebook still runs on the older seaborn it was written for.\n",
    "fig, ax = plt.subplots()\n",
    "for d in norm_dims:\n",
    "    x = np.random.randn(1000, d)\n",
    "    sns.distplot(norms(x), kde_kws={'label': 'dim={0:d}'.format(d)}, ax=ax)\n",
    "ax.set_xlabel('Norm')\n",
    "# Draw the legend explicitly instead of relying on seaborn to do it.\n",
    "ax.legend()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Three Gaussian clusters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dim = 500\n",
    "n_samples = 500\n",
    "mu0 = np.array([0, 5, 10])\n",
    "\n",
    "x1, x2, x3 = gen_gaussian_clusters(mu0, dim, n_samples)\n",
    "\n",
    "# Visualize if possible\n",
    "fig, ax = plt.subplots()\n",
    "if dim == 1:\n",
    "    for cluster in (x1, x2, x3):\n",
    "        ax.hist(cluster)\n",
    "elif dim >= 2:\n",
    "    # Only the first two coordinates of each sample are shown.\n",
    "    for cluster in (x1, x2, x3):\n",
    "        ax.scatter(cluster[:, 0], cluster[:, 1])\n",
    "    ax.axis('equal')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Distances as the dimension grows\n",
    "\n",
    "The cells below reuse `mu0` and `n_samples` from the previous section."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot the distribution of distances for various # of dimensions\n",
    "cluster_dims = [3, 10, 50, 100, 200, 500]\n",
    "cluster_names = ['x1', 'x2', 'x3']\n",
    "\n",
    "# Distance distribution\n",
    "fig, axes = plt.subplots(2, 3, figsize=(15, 10))\n",
    "for ax, d in zip(axes.ravel(), cluster_dims):\n",
    "    clusters = gen_gaussian_clusters(mu0, d, n_samples)\n",
    "    for name, cluster in zip(cluster_names, clusters):\n",
    "        # Label each density curve directly so the legend does not depend on artist order.\n",
    "        sns.distplot(norms(cluster), kde_kws={'label': name}, ax=ax)\n",
    "    ax.legend()\n",
    "    ax.set_xlabel('Distance to origin')\n",
    "    ax.set_title('dim = {0:d}'.format(d))\n",
    "    ax.set_xlim(0, ax.get_xlim()[1])\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Distance matrix\n",
    "fig, axes = plt.subplots(2, 3, figsize=(16, 10))\n",
    "for ax, d in zip(axes.ravel(), cluster_dims):\n",
    "    clusters = gen_gaussian_clusters(mu0, d, n_samples)\n",
    "    # Pairwise Euclidean distances between all samples, stacked in order x1, x2, x3.\n",
    "    D = squareform(pdist(np.concatenate(clusters, axis=0), 'euclidean'))\n",
    "    sns.heatmap(D, ax=ax)\n",
    "    ax.set_title('dim = {0:d}'.format(d))\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}