{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Scientifics Packages\n",
    "Gonzalo Rios (grios@dim.uchile.cl)\n",
    "\n",
    "http://scikits.appspot.com/scikits\n",
    "\n",
    "https://www.scipy.org/topical-software.html"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Sympy: Symbolic Mathematics in Python\n",
    "http://www.sympy.org"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:35:03.899222Z",
     "start_time": "2018-11-08T17:35:03.530969Z"
    }
   },
   "outputs": [],
   "source": [
    "import sympy as sy\n",
    "x, y = sy.symbols('x y')\n",
    "x, y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:35:18.708308Z",
     "start_time": "2018-11-08T17:35:18.701580Z"
    }
   },
   "outputs": [],
   "source": [
    "expr = x + 2*y\n",
    "expr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:35:29.373796Z",
     "start_time": "2018-11-08T17:35:29.367747Z"
    }
   },
   "outputs": [],
   "source": [
    "expr + 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:35:31.704670Z",
     "start_time": "2018-11-08T17:35:31.693832Z"
    }
   },
   "outputs": [],
   "source": [
    "expr + 1 - x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:35:49.349256Z",
     "start_time": "2018-11-08T17:35:49.345237Z"
    }
   },
   "outputs": [],
   "source": [
    "x*expr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:35:56.684739Z",
     "start_time": "2018-11-08T17:35:56.678400Z"
    }
   },
   "outputs": [],
   "source": [
    "sy.expand(x*expr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:36:07.805939Z",
     "start_time": "2018-11-08T17:36:07.783355Z"
    }
   },
   "outputs": [],
   "source": [
    "sy.factor(sy.expand(x*expr))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:36:27.403448Z",
     "start_time": "2018-11-08T17:36:27.396924Z"
    }
   },
   "outputs": [],
   "source": [
    "sy.diff(sy.sin(x)*sy.exp(x), x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:37:00.562619Z",
     "start_time": "2018-11-08T17:37:00.185132Z"
    }
   },
   "outputs": [],
   "source": [
    "sy.integrate(sy.exp(x)*sy.sin(x) + sy.exp(x)*sy.cos(x), x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:37:23.719571Z",
     "start_time": "2018-11-08T17:37:23.674013Z"
    }
   },
   "outputs": [],
   "source": [
    "r = sy.limit(sy.sin(x)/x, x, 0)\n",
    "r"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:37:34.527393Z",
     "start_time": "2018-11-08T17:37:34.507056Z"
    }
   },
   "outputs": [],
   "source": [
    "sy.solve(x**2 - 2, x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:37:57.331434Z",
     "start_time": "2018-11-08T17:37:57.325689Z"
    }
   },
   "outputs": [],
   "source": [
    "sy.Integral(sy.cos(x)**2, (x, 0, sy.pi))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:38:11.981472Z",
     "start_time": "2018-11-08T17:38:11.977311Z"
    }
   },
   "outputs": [],
   "source": [
    "sy.pprint(sy.Integral(sy.cos(x)**2, (x, 0, sy.pi)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:38:31.524171Z",
     "start_time": "2018-11-08T17:38:31.515698Z"
    }
   },
   "outputs": [],
   "source": [
    "print(sy.latex(sy.Integral(sy.cos(x)**2, (x, 0, sy.pi))))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:38:35.950664Z",
     "start_time": "2018-11-08T17:38:35.942899Z"
    }
   },
   "outputs": [],
   "source": [
    "%%latex\n",
    "$\\int_{0}^{\\pi} \\cos^{2}{\\left (x \\right )}\\, dx$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:38:55.590372Z",
     "start_time": "2018-11-08T17:38:55.573481Z"
    }
   },
   "outputs": [],
   "source": [
    "sy.Integral(sy.cos(x)**2, (x, 0, sy.pi)).evalf()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:39:13.305995Z",
     "start_time": "2018-11-08T17:39:13.301251Z"
    }
   },
   "outputs": [],
   "source": [
    "expr = sy.diff(sy.sin(x)*sy.exp(x), x)\n",
    "expr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:39:31.767547Z",
     "start_time": "2018-11-08T17:39:31.763212Z"
    }
   },
   "outputs": [],
   "source": [
    "expr.evalf(subs={x: 3.14})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:40:03.995871Z",
     "start_time": "2018-11-08T17:40:03.809186Z"
    }
   },
   "outputs": [],
   "source": [
    "f = sy.lambdify(x, expr)\n",
    "f"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:40:08.505977Z",
     "start_time": "2018-11-08T17:40:08.502772Z"
    }
   },
   "outputs": [],
   "source": [
    "f(3.14)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:40:27.743742Z",
     "start_time": "2018-11-08T17:40:27.734482Z"
    }
   },
   "outputs": [],
   "source": [
    "fnp = sy.lambdify(x, expr, 'numpy')\n",
    "fnp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:40:33.166953Z",
     "start_time": "2018-11-08T17:40:33.161863Z"
    }
   },
   "outputs": [],
   "source": [
    "fnp(3.14)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:40:48.833113Z",
     "start_time": "2018-11-08T17:40:46.040399Z"
    }
   },
   "outputs": [],
   "source": [
    "%timeit expr.evalf(subs={x: 3.14})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:40:51.134217Z",
     "start_time": "2018-11-08T17:40:48.835435Z"
    }
   },
   "outputs": [],
   "source": [
    "%timeit f(3.14)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:40:54.012299Z",
     "start_time": "2018-11-08T17:40:51.135967Z"
    }
   },
   "outputs": [],
   "source": [
    "%timeit fnp(3.14)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:41:22.934874Z",
     "start_time": "2018-11-08T17:41:22.927894Z"
    }
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "x_test = np.random.randn(1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:41:30.990565Z",
     "start_time": "2018-11-08T17:41:25.618306Z"
    }
   },
   "outputs": [],
   "source": [
    "%timeit fnp(x_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:41:34.440993Z",
     "start_time": "2018-11-08T17:41:30.992122Z"
    }
   },
   "outputs": [],
   "source": [
    "%timeit np.array([f(i) for i in x_test])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# StatsModels: Statistic in Python\n",
    "http://www.statsmodels.org/stable/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:43:46.583020Z",
     "start_time": "2018-11-08T17:43:45.348545Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import statsmodels.api as sm\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sb\n",
    "%matplotlib inline\n",
    "\n",
    "sb.set(context='notebook', rc={\"figure.figsize\": (20, 8)})\n",
    "\n",
    "nobs = 10000\n",
    "X = np.random.random((nobs, 2))\n",
    "X = sm.add_constant(X)\n",
    "X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:45:06.676118Z",
     "start_time": "2018-11-08T17:45:06.227995Z"
    }
   },
   "outputs": [],
   "source": [
    "beta = [2.2, .3, .8]\n",
    "e = np.random.randn(nobs)*0.2\n",
    "\n",
    "y = np.dot(X, beta) + e\n",
    "plt.plot(y, 'ro')\n",
    "plt.show()\n",
    "plt.plot(X[:, 0], 'r', label=\"0\")\n",
    "plt.plot(X[:, 1], 'bo', label=\"1\")\n",
    "plt.plot(X[:, 2], 'g*', label=\"2\")\n",
    "plt.legend()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:46:00.751773Z",
     "start_time": "2018-11-08T17:46:00.729248Z"
    }
   },
   "outputs": [],
   "source": [
    "results = sm.OLS(y, X).fit()\n",
    "results.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:47:26.050918Z",
     "start_time": "2018-11-08T17:47:22.741061Z"
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import statsmodels.formula.api as smf\n",
    "\n",
    "try:\n",
    "    dat = sm.datasets.get_rdataset(\"Guerry\", \"HistData\").data\n",
    "    dat.to_hdf('Guerry.h5', key='HistData', mode='w')\n",
    "    print('Load dataset from R')\n",
    "except:\n",
    "    dat = pd.read_hdf('Guerry.h5')\n",
    "    print('Load dataset from HDF file')\n",
    "dat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:48:33.345657Z",
     "start_time": "2018-11-08T17:48:33.229969Z"
    }
   },
   "outputs": [],
   "source": [
    "dat.Lottery.plot(style='ro')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:49:38.041610Z",
     "start_time": "2018-11-08T17:49:37.998645Z"
    }
   },
   "outputs": [],
   "source": [
    "results = smf.ols('Lottery ~ Literacy + Infanticide**2 + np.log(Pop1831)', data=dat).fit()\n",
    "results.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:51:25.689879Z",
     "start_time": "2018-11-08T17:51:25.686247Z"
    }
   },
   "outputs": [],
   "source": [
    "results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:51:28.750555Z",
     "start_time": "2018-11-08T17:51:28.746479Z"
    }
   },
   "outputs": [],
   "source": [
    "print('R2: ', results.rsquared)\n",
    "print('Parameters: ', results.params)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Statistics Tests"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:52:25.137875Z",
     "start_time": "2018-11-08T17:52:24.890415Z"
    }
   },
   "outputs": [],
   "source": [
    "results.resid.hist(bins=100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:53:16.274308Z",
     "start_time": "2018-11-08T17:53:16.258706Z"
    }
   },
   "outputs": [],
   "source": [
    "from statsmodels.compat import lzip\n",
    "import statsmodels.stats.api as sms\n",
    "\n",
    "name = ['Jarque-Bera', 'Chi^2 two-tail prob.', 'Skew', 'Kurtosis']\n",
    "test = sms.jarque_bera(results.resid)\n",
    "lzip(name, test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:53:39.016483Z",
     "start_time": "2018-11-08T17:53:39.011277Z"
    }
   },
   "outputs": [],
   "source": [
    "name = ['Chi^2', 'Two-tail probability']\n",
    "test = sms.omni_normtest(results.resid)\n",
    "lzip(name, test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:53:58.154214Z",
     "start_time": "2018-11-08T17:53:58.141387Z"
    }
   },
   "outputs": [],
   "source": [
    "name = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value']\n",
    "test = sms.het_breushpagan(results.resid, results.model.exog)\n",
    "lzip(name, test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:54:04.314049Z",
     "start_time": "2018-11-08T17:54:04.306469Z"
    }
   },
   "outputs": [],
   "source": [
    "name = ['F statistic', 'p-value']\n",
    "test = sms.het_goldfeldquandt(results.resid, results.model.exog)\n",
    "lzip(name, test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:54:06.448338Z",
     "start_time": "2018-11-08T17:54:06.438435Z"
    }
   },
   "outputs": [],
   "source": [
    "name = ['t value', 'p value']\n",
    "test = sms.linear_harvey_collier(results)\n",
    "lzip(name, test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:54:13.698572Z",
     "start_time": "2018-11-08T17:54:13.690270Z"
    }
   },
   "outputs": [],
   "source": [
    "np.linalg.cond(results.model.exog)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:54:29.099781Z",
     "start_time": "2018-11-08T17:54:28.919522Z"
    }
   },
   "outputs": [],
   "source": [
    "from statsmodels.graphics.regressionplots import plot_leverage_resid2\n",
    "\n",
    "_ = plot_leverage_resid2(results)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Non-Linear Regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:56:01.839623Z",
     "start_time": "2018-11-08T17:56:01.738823Z"
    }
   },
   "outputs": [],
   "source": [
    "nsample = 50\n",
    "sig = 0.5\n",
    "x = np.linspace(0, 20, nsample)\n",
    "X = np.column_stack((x, np.sin(x), (x-5)**2, np.ones(nsample)))\n",
    "beta = [0.5, 0.5, -0.02, 5.]\n",
    "\n",
    "y_true = np.dot(X, beta)\n",
    "y = y_true + sig * np.random.normal(size=nsample)\n",
    "plt.plot(y, 'ro')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:56:22.820250Z",
     "start_time": "2018-11-08T17:56:22.803991Z"
    }
   },
   "outputs": [],
   "source": [
    "res = sm.OLS(y, X).fit()\n",
    "res.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:57:03.575614Z",
     "start_time": "2018-11-08T17:57:03.571363Z"
    }
   },
   "outputs": [],
   "source": [
    "print('Parameters: ', res.params)\n",
    "print('Standard errors: ', res.bse)\n",
    "print('Predicted values: ', res.predict())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:57:53.241373Z",
     "start_time": "2018-11-08T17:57:53.104650Z"
    }
   },
   "outputs": [],
   "source": [
    "from statsmodels.sandbox.regression.predstd import wls_prediction_std\n",
    "\n",
    "prstd, iv_l, iv_u = wls_prediction_std(res)\n",
    "\n",
    "plt.plot(y, 'o', label=\"data\")\n",
    "plt.plot(y_true, 'b-', label=\"True\")\n",
    "plt.plot(res.fittedvalues, 'r--.', label=\"OLS\")\n",
    "plt.plot(iv_u, 'r--')\n",
    "plt.plot(iv_l, 'r--')\n",
    "plt.legend(loc='best')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-06T20:16:22.381177Z",
     "start_time": "2017-09-06T20:16:22.375969Z"
    }
   },
   "source": [
    "## Heteroscedasticity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:58:41.425846Z",
     "start_time": "2018-11-08T17:58:41.419085Z"
    }
   },
   "outputs": [],
   "source": [
    "nsample = 50\n",
    "x = np.linspace(0, 20, nsample)\n",
    "X = np.column_stack((x, (x - 5)**2))\n",
    "X = sm.add_constant(X)\n",
    "beta = [5., 0.5, -0.01]\n",
    "sig = 0.5\n",
    "w = np.ones(nsample)\n",
    "w[nsample * 6//10:] = 3\n",
    "y_true = np.dot(X, beta)\n",
    "e = np.random.normal(size=nsample)\n",
    "y = y_true + sig * w * e \n",
    "X = X[:,[0,1]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T17:59:01.554727Z",
     "start_time": "2018-11-08T17:59:01.421397Z"
    }
   },
   "outputs": [],
   "source": [
    "plt.plot(y, '*')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:00:39.426814Z",
     "start_time": "2018-11-08T18:00:39.411207Z"
    }
   },
   "outputs": [],
   "source": [
    "mod_wls = sm.WLS(y, X, weights=1./w)\n",
    "res_wls = mod_wls.fit()\n",
    "res_wls.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:00:54.027411Z",
     "start_time": "2018-11-08T18:00:54.013512Z"
    }
   },
   "outputs": [],
   "source": [
    "res_ols = sm.OLS(y, X).fit()\n",
    "res_ols.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:01:11.791916Z",
     "start_time": "2018-11-08T18:01:11.780312Z"
    }
   },
   "outputs": [],
   "source": [
    "print('OLS', res_ols.params)\n",
    "print('WLS', res_wls.params)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:01:21.602371Z",
     "start_time": "2018-11-08T18:01:21.432660Z"
    }
   },
   "outputs": [],
   "source": [
    "prstd_ols, iv_l_ols, iv_u_ols = wls_prediction_std(res_ols)\n",
    "prstd, iv_l, iv_u = wls_prediction_std(res_wls)\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(8,6))\n",
    "ax.plot(x, y, 'o', label=\"Data\")\n",
    "ax.plot(x, y_true, 'b-', label=\"True\")\n",
    "# OLS\n",
    "ax.plot(x, res_ols.fittedvalues, 'r--')\n",
    "ax.plot(x, iv_u_ols, 'r--', label=\"OLS\")\n",
    "ax.plot(x, iv_l_ols, 'r--')\n",
    "# WLS\n",
    "ax.plot(x, res_wls.fittedvalues, 'g--.')\n",
    "ax.plot(x, iv_u, 'g--', label=\"WLS\")\n",
    "ax.plot(x, iv_l, 'g--')\n",
    "ax.legend(loc=\"best\");"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Autoregressive Moving Average (ARMA)\n",
    "http://www.statsmodels.org/dev/examples/notebooks/generated/tsa_arma_0.html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:02:04.846885Z",
     "start_time": "2018-11-08T18:02:04.661132Z"
    }
   },
   "outputs": [],
   "source": [
    "dta = sm.datasets.sunspots.load_pandas().data\n",
    "dta.index = pd.Index(sm.tsa.datetools.dates_from_range('1700', '2008'))\n",
    "del dta[\"YEAR\"]\n",
    "dta.plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:02:54.913119Z",
     "start_time": "2018-11-08T18:02:54.682118Z"
    }
   },
   "outputs": [],
   "source": [
    " _ = sm.graphics.tsa.plot_acf(dta.values.squeeze(), lags=40)\n",
    "_ = sm.graphics.tsa.plot_pacf(dta, lags=40)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:05:12.279648Z",
     "start_time": "2018-11-08T18:05:12.194182Z"
    }
   },
   "outputs": [],
   "source": [
    "arma_mod20 = sm.tsa.ARMA(dta, (2,0)).fit(disp=False)\n",
    "print(arma_mod20.params)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:07:25.650857Z",
     "start_time": "2018-11-08T18:07:25.527014Z"
    }
   },
   "outputs": [],
   "source": [
    "arma_mod30 = sm.tsa.ARMA(dta, (3,0)).fit(disp=False)\n",
    "print(arma_mod30.params)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:07:26.046296Z",
     "start_time": "2018-11-08T18:07:26.040966Z"
    }
   },
   "outputs": [],
   "source": [
    "print(arma_mod20.aic, arma_mod20.bic, arma_mod20.hqic)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:07:26.711582Z",
     "start_time": "2018-11-08T18:07:26.707818Z"
    }
   },
   "outputs": [],
   "source": [
    "print(arma_mod30.aic, arma_mod30.bic, arma_mod30.hqic)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:07:50.962394Z",
     "start_time": "2018-11-08T18:07:50.949200Z"
    }
   },
   "outputs": [],
   "source": [
    "sm.stats.durbin_watson(arma_mod30.resid.values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:07:55.286855Z",
     "start_time": "2018-11-08T18:07:55.283054Z"
    }
   },
   "outputs": [],
   "source": [
    "arma_mod30.model.endog_names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:07:56.594253Z",
     "start_time": "2018-11-08T18:07:56.590935Z"
    }
   },
   "outputs": [],
   "source": [
    "arma_mod30.model.exog_names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:08:09.960714Z",
     "start_time": "2018-11-08T18:08:09.729586Z"
    }
   },
   "outputs": [],
   "source": [
    "_ = arma_mod20.plot_predict()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:07:58.586793Z",
     "start_time": "2018-11-08T18:07:58.407530Z"
    }
   },
   "outputs": [],
   "source": [
    "_ = arma_mod30.plot_predict()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:08:23.032832Z",
     "start_time": "2018-11-08T18:08:22.866801Z"
    }
   },
   "outputs": [],
   "source": [
    "arma_mod20.resid.plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:08:24.114883Z",
     "start_time": "2018-11-08T18:08:23.960027Z"
    }
   },
   "outputs": [],
   "source": [
    "arma_mod30.resid.plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:08:39.386774Z",
     "start_time": "2018-11-08T18:08:39.265069Z"
    }
   },
   "outputs": [],
   "source": [
    "plt.plot(arma_mod30.forecast(100)[0])\n",
    "plt.plot(arma_mod30.forecast(100)[2])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Seasonal Decompose"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:09:11.738439Z",
     "start_time": "2018-11-08T18:09:11.474671Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "co2 = pd.DataFrame(sm.datasets.co2.data.load().data)\n",
    "co2.index = pd.to_datetime(co2['date'].map(lambda x: x.decode(\"utf-8\") ))\n",
    "del co2['date']\n",
    "co2.plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:10:27.654384Z",
     "start_time": "2018-11-08T18:10:27.436381Z"
    }
   },
   "outputs": [],
   "source": [
    "co2.fillna(method='ffill', inplace=True)\n",
    "co2.plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:11:13.683857Z",
     "start_time": "2018-11-08T18:11:13.012664Z"
    }
   },
   "outputs": [],
   "source": [
    "decomp = sm.tsa.seasonal_decompose(co2, model='additive')\n",
    "additive = pd.DataFrame(index=co2.index)\n",
    "additive['Real'] = decomp.observed\n",
    "additive['Trend'] = decomp.trend\n",
    "additive['Seasonal'] = decomp.seasonal\n",
    "additive['Resid'] = decomp.resid\n",
    "\n",
    "_ = additive.plot(subplots=True, layout=(2, 2), figsize=(20, 10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:11:14.493299Z",
     "start_time": "2018-11-08T18:11:13.685594Z"
    }
   },
   "outputs": [],
   "source": [
    "decomp = sm.tsa.seasonal_decompose(co2, model='multiplicative')\n",
    "multiplicative = pd.DataFrame(index=co2.index)\n",
    "multiplicative['Real'] = decomp.observed\n",
    "multiplicative['Trend'] = decomp.trend\n",
    "multiplicative['Seasonal'] = decomp.seasonal\n",
    "multiplicative['Resid'] = decomp.resid\n",
    "\n",
    "_ = multiplicative.plot(subplots=True, layout=(2, 2), figsize=(20, 10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:11:58.488744Z",
     "start_time": "2018-11-08T18:11:58.217275Z"
    }
   },
   "outputs": [],
   "source": [
    "models = pd.DataFrame(index=co2.index)\n",
    "models['Real'] = decomp.observed\n",
    "models['Additive'] = additive['Trend'] + additive['Seasonal']\n",
    "models['Multiplicative'] = multiplicative['Trend']*multiplicative['Seasonal']\n",
    "models.plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:12:16.534263Z",
     "start_time": "2018-11-08T18:12:16.529540Z"
    }
   },
   "outputs": [],
   "source": [
    "np.mean((models['Real'] - models['Additive'])**2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:12:16.978757Z",
     "start_time": "2018-11-08T18:12:16.973133Z"
    }
   },
   "outputs": [],
   "source": [
    "np.mean((models['Real'] - models['Multiplicative'])**2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# PyMC3: Bayesian Statistical Modeling and Probabilistic Machine Learning "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:20:41.168892Z",
     "start_time": "2018-11-08T18:20:40.960336Z"
    }
   },
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sb\n",
    "import os\n",
    "os.environ['MKL_THREADING_LAYER'] = 'GNU'\n",
    "import pymc3 as pm\n",
    "\n",
    "data = 1.43 + 1.23*np.random.randn(1000)\n",
    "\n",
    "with pm.Model() as model:\n",
    "    mu = pm.Normal('mu', mu=0, sd=1)\n",
    "    sd = pm.Exponential('sd', lam=1)\n",
    "    obs = pm.Normal('obs', mu=mu, sd=sd, observed=data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:20:41.196052Z",
     "start_time": "2018-11-08T18:20:41.192727Z"
    }
   },
   "outputs": [],
   "source": [
    "model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:20:41.416818Z",
     "start_time": "2018-11-08T18:20:41.413312Z"
    }
   },
   "outputs": [],
   "source": [
    "model.basic_RVs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:20:41.639120Z",
     "start_time": "2018-11-08T18:20:41.636009Z"
    }
   },
   "outputs": [],
   "source": [
    "model.free_RVs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:20:41.823125Z",
     "start_time": "2018-11-08T18:20:41.820014Z"
    }
   },
   "outputs": [],
   "source": [
    "model.observed_RVs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:20:42.051862Z",
     "start_time": "2018-11-08T18:20:42.048257Z"
    }
   },
   "outputs": [],
   "source": [
    "model.deterministics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:20:42.393820Z",
     "start_time": "2018-11-08T18:20:42.389686Z"
    }
   },
   "outputs": [],
   "source": [
    "p = model.test_point.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:20:42.778045Z",
     "start_time": "2018-11-08T18:20:42.774292Z"
    }
   },
   "outputs": [],
   "source": [
    "p['mu'] = 1.45"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:20:43.913569Z",
     "start_time": "2018-11-08T18:20:43.307158Z"
    }
   },
   "outputs": [],
   "source": [
    "with model:\n",
    "    p = pm.find_MAP()\n",
    "print(p)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:20:44.632867Z",
     "start_time": "2018-11-08T18:20:43.914966Z"
    }
   },
   "outputs": [],
   "source": [
    "with model:\n",
    "    p = pm.find_MAP(start=p)\n",
    "print(p)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:20:45.252284Z",
     "start_time": "2018-11-08T18:20:44.634669Z"
    }
   },
   "outputs": [],
   "source": [
    "import scipy as sp\n",
    "\n",
    "\n",
    "with model:\n",
    "    p = pm.find_MAP(fmin=sp.optimize.fmin_powell)\n",
    "print(p)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:21:29.111581Z",
     "start_time": "2018-11-08T18:21:26.975434Z"
    }
   },
   "outputs": [],
   "source": [
    "with model:\n",
    "    trace = pm.sample(1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:22:00.827496Z",
     "start_time": "2018-11-08T18:22:00.823368Z"
    }
   },
   "outputs": [],
   "source": [
    "trace"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:22:07.600108Z",
     "start_time": "2018-11-08T18:22:07.594363Z"
    }
   },
   "outputs": [],
   "source": [
    "trace.varnames"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:22:09.224776Z",
     "start_time": "2018-11-08T18:22:09.218686Z"
    }
   },
   "outputs": [],
   "source": [
    "trace['mu']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:22:17.543800Z",
     "start_time": "2018-11-08T18:22:17.129996Z"
    }
   },
   "outputs": [],
   "source": [
    "pm.traceplot(trace)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:23:28.267025Z",
     "start_time": "2018-11-08T18:23:27.807306Z"
    }
   },
   "outputs": [],
   "source": [
    "_ = pm.traceplot(trace[500:])\n",
    "pm.summary(trace[500:])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:25:24.255912Z",
     "start_time": "2018-11-08T18:25:22.959167Z"
    }
   },
   "outputs": [],
   "source": [
    "with model:\n",
    "    trace_m = pm.sample(1000, step=pm.Metropolis())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:25:29.828718Z",
     "start_time": "2018-11-08T18:25:29.382967Z"
    }
   },
   "outputs": [],
   "source": [
    "_ = pm.traceplot(trace_m[500:])\n",
    "pm.summary(trace_m[500:])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:25:43.563492Z",
     "start_time": "2018-11-08T18:25:41.265997Z"
    }
   },
   "outputs": [],
   "source": [
    "with model:\n",
    "    trace_s = pm.sample(1000, step=pm.Slice(), njobs=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:25:51.960296Z",
     "start_time": "2018-11-08T18:25:51.449060Z"
    }
   },
   "outputs": [],
   "source": [
    "_ = pm.traceplot(trace_s[500:])\n",
    "pm.summary(trace_s[500:])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:26:51.323440Z",
     "start_time": "2018-11-08T18:26:51.091582Z"
    }
   },
   "outputs": [],
   "source": [
    "with model:\n",
    "    post_pred = pm.sample_ppc(trace_s[500:], samples=500, size=len(data))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:26:54.356677Z",
     "start_time": "2018-11-08T18:26:54.195708Z"
    },
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "plt.figure()\n",
    "ax = sb.distplot(post_pred['obs'].mean(axis=1), label='Posterior predictive means')\n",
    "ax.axvline(data.mean(), color='r', ls='--', label='True mean')\n",
    "ax.legend()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Scikit Learn: Machine Learning in Python\n",
    "## Clustering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:30:51.579127Z",
     "start_time": "2018-11-08T18:30:51.560908Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import sklearn as sk\n",
    "\n",
    "from sklearn import cluster, mixture\n",
    "n_samples = 500\n",
    "\n",
    "C = np.array([[0., -0.1, 3.0], [1.7, .4, 3.1]])\n",
    "X = np.concatenate([np.dot(np.random.randn(n_samples, 2), C),\n",
    "                   .7 * np.random.randn(n_samples, 3) + np.array([-6, 3, 2])])\n",
    "X = pd.DataFrame(X, columns=['x1', 'x2', 'x3'])\n",
    "X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:31:48.219012Z",
     "start_time": "2018-11-08T18:31:48.192357Z"
    }
   },
   "outputs": [],
   "source": [
    "kmeans = cluster.KMeans(n_clusters=2).fit(X)\n",
    "kmeans"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:31:59.371293Z",
     "start_time": "2018-11-08T18:31:59.364590Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "kmeans.predict(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:32:12.928767Z",
     "start_time": "2018-11-08T18:32:12.924334Z"
    }
   },
   "outputs": [],
   "source": [
    "kmeans.cluster_centers_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:32:47.672993Z",
     "start_time": "2018-11-08T18:32:47.665185Z"
    }
   },
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import itertools\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "\n",
    "def plot_clusters(X, cluster, title):\n",
    "    for ids in [[0, 1], [0, 2], [1, 2]]:\n",
    "        plt.figure(figsize=(15, 3))\n",
    "        plot = plt.subplot(111)\n",
    "        color_iter = itertools.cycle(['navy', 'c', 'cornflowerblue', 'gold', 'darkorange', 'r', 'g', 'k', 'm'])\n",
    "        for i, color in zip(range(min(cluster), max(cluster)+1), color_iter):\n",
    "            if not np.any(cluster == i):\n",
    "                continue\n",
    "            plt.scatter(X[cluster == i, ids[0]], X[cluster == i, ids[1]], .8, color=color)\n",
    "        plt.title(title + str(ids))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:32:52.701958Z",
     "start_time": "2018-11-08T18:32:52.388694Z"
    }
   },
   "outputs": [],
   "source": [
    "plot_clusters(X.values, kmeans.predict(X), 'KMeans')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:33:25.557712Z",
     "start_time": "2018-11-08T18:33:25.123186Z"
    }
   },
   "outputs": [],
   "source": [
    "kmeans = cluster.KMeans(n_clusters=10).fit(X)\n",
    "\n",
    "plot_clusters(X.values, kmeans.predict(X), 'KMeans')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:34:48.761611Z",
     "start_time": "2018-11-08T18:34:48.359539Z"
    }
   },
   "outputs": [],
   "source": [
    "agglomerative = cluster.AgglomerativeClustering(n_clusters=10, linkage='ward').fit(X)\n",
    "\n",
    "plot_clusters(X.values, agglomerative.fit_predict(X), 'AgglomerativeClustering')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:35:01.963545Z",
     "start_time": "2018-11-08T18:35:01.576697Z"
    }
   },
   "outputs": [],
   "source": [
    "birch = cluster.Birch(branching_factor=50, n_clusters=10, threshold=0.5, compute_labels=True).fit(X)\n",
    "plot_clusters(X.values, birch.predict(X), 'Birch')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:35:17.097851Z",
     "start_time": "2018-11-08T18:35:10.890679Z"
    }
   },
   "outputs": [],
   "source": [
    "affinity = cluster.AffinityPropagation(preference=-100).fit(X)\n",
    "\n",
    "plot_clusters(X.values, affinity.predict(X), 'AffinityPropagation')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:36:07.575077Z",
     "start_time": "2018-11-08T18:36:07.209363Z"
    }
   },
   "outputs": [],
   "source": [
    "dbscan = cluster.DBSCAN(eps=0.5, min_samples=10).fit(X)\n",
    "plot_clusters(X.values, dbscan.fit_predict(X), 'DBSCAN')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:36:23.715257Z",
     "start_time": "2018-11-08T18:36:22.284064Z"
    }
   },
   "outputs": [],
   "source": [
    "bandwidth = cluster.estimate_bandwidth(X.values, quantile=0.01, n_samples=500)\n",
    "meanshift = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True).fit(X)\n",
    "\n",
    "plot_clusters(X.values, meanshift.predict(X), 'MeanShift')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:36:34.799567Z",
     "start_time": "2018-11-08T18:36:34.387441Z"
    }
   },
   "outputs": [],
   "source": [
    "gmm = mixture.GaussianMixture(n_components=10, covariance_type='full').fit(X)\n",
    "plot_clusters(X.values, gmm.predict(X), 'Gaussian Mixture')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:36:55.939105Z",
     "start_time": "2018-11-08T18:36:55.447435Z"
    }
   },
   "outputs": [],
   "source": [
    "dpgmm = mixture.BayesianGaussianMixture(n_components=10, covariance_type='full').fit(X)\n",
    "plot_clusters(X.values, dpgmm.predict(X), 'Bayesian Gaussian Mixture with a Dirichlet process prior')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Classification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:38:06.533971Z",
     "start_time": "2018-11-08T18:38:05.877782Z"
    }
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from matplotlib.colors import ListedColormap\n",
    "from sklearn import datasets\n",
    "from sklearn.neighbors import NearestCentroid\n",
    "\n",
    "n_neighbors = 15\n",
    "\n",
    "# import some data to play with\n",
    "iris = datasets.load_iris()\n",
    "# we only take the first two features. We could avoid this ugly\n",
    "# slicing by using a two-dim dataset\n",
    "X = iris.data[:, :2]\n",
    "y = iris.target\n",
    "\n",
    "h = .02  # step size in the mesh\n",
    "\n",
    "# Create color maps\n",
    "cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])\n",
    "cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])\n",
    "\n",
    "for shrinkage in [None, 1.0, .2, 0.1]:\n",
    "    # we create an instance of Neighbours Classifier and fit the data.\n",
    "    clf = NearestCentroid(shrink_threshold=shrinkage)\n",
    "    clf.fit(X, y)\n",
    "    y_pred = clf.predict(X)\n",
    "\n",
    "    # Plot the decision boundary. For that, we will assign a color to each\n",
    "    # point in the mesh [x_min, x_max]x[y_min, y_max].\n",
    "    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1\n",
    "    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1\n",
    "    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),\n",
    "                         np.arange(y_min, y_max, h))\n",
    "    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])\n",
    "\n",
    "    # Put the result into a color plot\n",
    "    Z = Z.reshape(xx.shape)\n",
    "    plt.figure(figsize=(10, 5))\n",
    "    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)\n",
    "\n",
    "    # Plot also the training points\n",
    "    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold,\n",
    "                edgecolor='b', s=20)\n",
    "    plt.title(\"3-Class classification (shrink_threshold={}, precision={})\".format(shrinkage, np.mean(y == y_pred)))\n",
    "    plt.axis('tight')\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:40:59.645937Z",
     "start_time": "2018-11-08T18:40:59.042862Z"
    }
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from matplotlib.colors import ListedColormap\n",
    "from sklearn import neighbors, datasets\n",
    "\n",
    "n_neighbors = 15\n",
    "\n",
    "# import some data to play with\n",
    "iris = datasets.load_iris()\n",
    "\n",
    "# we only take the first two features. We could avoid this ugly\n",
    "# slicing by using a two-dim dataset\n",
    "X = iris.data[:, :2]\n",
    "y = iris.target\n",
    "\n",
    "h = .02  # step size in the mesh\n",
    "\n",
    "# Create color maps\n",
    "cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])\n",
    "cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])\n",
    "\n",
    "for weights in ['uniform', 'distance']:\n",
    "    # we create an instance of Neighbours Classifier and fit the data.\n",
    "    clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)\n",
    "    clf.fit(X, y)\n",
    "\n",
    "    # Plot the decision boundary. For that, we will assign a color to each\n",
    "    # point in the mesh [x_min, x_max]x[y_min, y_max].\n",
    "    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1\n",
    "    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1\n",
    "    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),\n",
    "                         np.arange(y_min, y_max, h))\n",
    "    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])\n",
    "\n",
    "    # Put the result into a color plot\n",
    "    Z = Z.reshape(xx.shape)\n",
    "    plt.figure(figsize=(10, 5))\n",
    "    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)\n",
    "\n",
    "    # Plot also the training points\n",
    "    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold,\n",
    "                edgecolor='k', s=20)\n",
    "    plt.xlim(xx.min(), xx.max())\n",
    "    plt.ylim(yy.min(), yy.max())\n",
    "    plt.title(\"3-Class classification (k = %i, weights = '%s')\" % (n_neighbors, weights))\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:44:36.647838Z",
     "start_time": "2018-11-08T18:44:29.575452Z"
    }
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from matplotlib.colors import ListedColormap\n",
    "\n",
    "from sklearn import clone\n",
    "from sklearn.datasets import load_iris\n",
    "from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier,\n",
    "                              AdaBoostClassifier)\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "\n",
    "# Parameters\n",
    "n_classes = 3\n",
    "n_estimators = 30\n",
    "cmap = plt.cm.RdYlBu\n",
    "plot_step = 0.02  # fine step width for decision surface contours\n",
    "plot_step_coarser = 0.5  # step widths for coarse classifier guesses\n",
    "RANDOM_SEED = 13  # fix the seed on each iteration\n",
    "\n",
    "# Load data\n",
    "iris = load_iris()\n",
    "\n",
    "plot_idx = 1\n",
    "\n",
    "models = [DecisionTreeClassifier(max_depth=None),\n",
    "          RandomForestClassifier(n_estimators=n_estimators),\n",
    "          ExtraTreesClassifier(n_estimators=n_estimators),\n",
    "          AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),\n",
    "                             n_estimators=n_estimators)]\n",
    "plt.figure(figsize=(20, 15))\n",
    "for pair in ([0, 1], [0, 2], [2, 3]):\n",
    "    for model in models:\n",
    "        # We only take the two corresponding features\n",
    "        X = iris.data[:, pair]\n",
    "        y = iris.target\n",
    "\n",
    "        # Shuffle\n",
    "        idx = np.arange(X.shape[0])\n",
    "        np.random.seed(RANDOM_SEED)\n",
    "        np.random.shuffle(idx)\n",
    "        X = X[idx]\n",
    "        y = y[idx]\n",
    "\n",
    "        # Standardize\n",
    "        mean = X.mean(axis=0)\n",
    "        std = X.std(axis=0)\n",
    "        X = (X - mean) / std\n",
    "\n",
    "        # Train\n",
    "        clf = clone(model)\n",
    "        clf = model.fit(X, y)\n",
    "\n",
    "        scores = clf.score(X, y)\n",
    "        # Create a title for each column and the console by using str() and\n",
    "        # slicing away useless parts of the string\n",
    "        model_title = str(type(model)).split(\n",
    "            \".\")[-1][:-2][:-len(\"Classifier\")]\n",
    "\n",
    "        model_details = model_title\n",
    "        if hasattr(model, \"estimators_\"):\n",
    "            model_details += \" with {} estimators\".format(\n",
    "                len(model.estimators_))\n",
    "        print(model_details + \" with features\", pair,\n",
    "              \"has a score of\", scores)\n",
    "\n",
    "        plt.subplot(3, 4, plot_idx)\n",
    "        if plot_idx <= len(models):\n",
    "            # Add a title at the top of each column\n",
    "            plt.title(model_title)\n",
    "\n",
    "        # Now plot the decision boundary using a fine mesh as input to a\n",
    "        # filled contour plot\n",
    "        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1\n",
    "        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1\n",
    "        xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),\n",
    "                             np.arange(y_min, y_max, plot_step))\n",
    "\n",
    "        # Plot either a single DecisionTreeClassifier or alpha blend the\n",
    "        # decision surfaces of the ensemble of classifiers\n",
    "        if isinstance(model, DecisionTreeClassifier):\n",
    "            Z = model.predict(np.c_[xx.ravel(), yy.ravel()])\n",
    "            Z = Z.reshape(xx.shape)\n",
    "            cs = plt.contourf(xx, yy, Z, cmap=cmap)\n",
    "        else:\n",
    "            # Choose alpha blend level with respect to the number\n",
    "            # of estimators\n",
    "            # that are in use (noting that AdaBoost can use fewer estimators\n",
    "            # than its maximum if it achieves a good enough fit early on)\n",
    "            estimator_alpha = 1.0 / len(model.estimators_)\n",
    "            for tree in model.estimators_:\n",
    "                Z = tree.predict(np.c_[xx.ravel(), yy.ravel()])\n",
    "                Z = Z.reshape(xx.shape)\n",
    "                cs = plt.contourf(xx, yy, Z, alpha=estimator_alpha, cmap=cmap)\n",
    "\n",
    "        # Build a coarser grid to plot a set of ensemble classifications\n",
    "        # to show how these are different to what we see in the decision\n",
    "        # surfaces. These points are regularly space and do not have a\n",
    "        # black outline\n",
    "        xx_coarser, yy_coarser = np.meshgrid(\n",
    "            np.arange(x_min, x_max, plot_step_coarser),\n",
    "            np.arange(y_min, y_max, plot_step_coarser))\n",
    "        Z_points_coarser = model.predict(np.c_[xx_coarser.ravel(),\n",
    "                                         yy_coarser.ravel()]\n",
    "                                         ).reshape(xx_coarser.shape)\n",
    "        cs_points = plt.scatter(xx_coarser, yy_coarser, s=15,\n",
    "                                c=Z_points_coarser, cmap=cmap,\n",
    "                                edgecolors=\"none\")\n",
    "\n",
    "        # Plot the training points, these are clustered together and have a\n",
    "        # black outline\n",
    "        plt.scatter(X[:, 0], X[:, 1], c=y,\n",
    "                    cmap=ListedColormap(['r', 'y', 'b']),\n",
    "                    edgecolor='k', s=20)\n",
    "        plot_idx += 1  # move on to the next plot in sequence\n",
    "\n",
    "plt.suptitle(\"Classifiers on feature subsets of the Iris dataset\")\n",
    "plt.axis(\"tight\")\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Dimensionality Reduction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:48:15.700893Z",
     "start_time": "2018-11-08T18:48:15.441447Z"
    }
   },
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "\n",
    "from sklearn import datasets\n",
    "from sklearn.decomposition import PCA\n",
    "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n",
    "\n",
    "iris = datasets.load_iris()\n",
    "\n",
    "X = iris.data\n",
    "y = iris.target\n",
    "target_names = iris.target_names\n",
    "\n",
    "pca = PCA(n_components=2)\n",
    "X_r = pca.fit(X).transform(X)\n",
    "\n",
    "lda = LinearDiscriminantAnalysis(n_components=2)\n",
    "X_r2 = lda.fit(X, y).transform(X)\n",
    "\n",
    "# Percentage of variance explained for each components\n",
    "print('explained variance ratio (first two components): %s' % str(pca.explained_variance_ratio_))\n",
    "\n",
    "plt.figure()\n",
    "colors = ['navy', 'turquoise', 'darkorange']\n",
    "lw = 2\n",
    "\n",
    "for color, i, target_name in zip(colors, [0, 1, 2], target_names):\n",
    "    plt.scatter(X_r[y == i, 0], X_r[y == i, 1], color=color, alpha=.8, lw=lw,\n",
    "                label=target_name)\n",
    "plt.legend(loc='best', shadow=False, scatterpoints=1)\n",
    "plt.title('PCA of IRIS dataset')\n",
    "\n",
    "plt.figure()\n",
    "for color, i, target_name in zip(colors, [0, 1, 2], target_names):\n",
    "    plt.scatter(X_r2[y == i, 0], X_r2[y == i, 1], alpha=.8, color=color,\n",
    "                label=target_name)\n",
    "plt.legend(loc='best', shadow=False, scatterpoints=1)\n",
    "plt.title('LDA of IRIS dataset')\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:50:10.108399Z",
     "start_time": "2018-11-08T18:50:09.762862Z"
    }
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "from sklearn.decomposition import PCA, KernelPCA\n",
    "from sklearn.datasets import make_circles\n",
    "\n",
    "np.random.seed(0)\n",
    "\n",
    "X, y = make_circles(n_samples=400, factor=.3, noise=.05)\n",
    "\n",
    "plt.figure(figsize=(12, 12))\n",
    "plt.subplot(2, 2, 1, aspect='equal')\n",
    "plt.title(\"Original space\")\n",
    "reds = y == 0\n",
    "blues = y == 1\n",
    "plt.scatter(X[reds, 0], X[reds, 1], c=\"red\", s=20, edgecolor='k')\n",
    "plt.scatter(X[blues, 0], X[blues, 1], c=\"blue\", s=20, edgecolor='k')\n",
    "plt.xlabel(\"$x_1$\")\n",
    "plt.ylabel(\"$x_2$\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-08T18:52:36.591746Z",
     "start_time": "2018-11-08T18:52:36.148517Z"
    }
   },
   "outputs": [],
   "source": [
    "kpca = KernelPCA(kernel=\"rbf\", fit_inverse_transform=True, gamma=10)\n",
    "X_kpca = kpca.fit_transform(X)\n",
    "X_back = kpca.inverse_transform(X_kpca)\n",
    "pca = PCA()\n",
    "X_pca = pca.fit_transform(X)\n",
    "\n",
    "# Plot results\n",
    "\n",
    "plt.figure(figsize=(12, 12))\n",
    "plt.subplot(2, 2, 1, aspect='equal')\n",
    "plt.title(\"Original space\")\n",
    "reds = y == 0\n",
    "blues = y == 1\n",
    "\n",
    "plt.scatter(X[reds, 0], X[reds, 1], c=\"red\", s=20, edgecolor='k')\n",
    "plt.scatter(X[blues, 0], X[blues, 1], c=\"blue\", s=20, edgecolor='k')\n",
    "plt.xlabel(\"$x_1$\")\n",
    "plt.ylabel(\"$x_2$\")\n",
    "\n",
    "X1, X2 = np.meshgrid(np.linspace(-1.5, 1.5, 50), np.linspace(-1.5, 1.5, 50))\n",
    "X_grid = np.array([np.ravel(X1), np.ravel(X2)]).T\n",
    "# projection on the first principal component (in the phi space)\n",
    "Z_grid = kpca.transform(X_grid)[:, 0].reshape(X1.shape)\n",
    "plt.contour(X1, X2, Z_grid, colors='grey', linewidths=1, origin='lower')\n",
    "\n",
    "plt.subplot(2, 2, 2, aspect='equal')\n",
    "plt.scatter(X_pca[reds, 0], X_pca[reds, 1], c=\"red\", s=20, edgecolor='k')\n",
    "plt.scatter(X_pca[blues, 0], X_pca[blues, 1], c=\"blue\", s=20, edgecolor='k')\n",
    "plt.title(\"Projection by PCA\")\n",
    "plt.xlabel(\"1st principal component\")\n",
    "plt.ylabel(\"2nd component\")\n",
    "\n",
    "plt.subplot(2, 2, 3, aspect='equal')\n",
    "plt.scatter(X_kpca[reds, 0], X_kpca[reds, 1], c=\"red\", s=20, edgecolor='k')\n",
    "plt.scatter(X_kpca[blues, 0], X_kpca[blues, 1], c=\"blue\", s=20, edgecolor='k')\n",
    "plt.title(\"Projection by KPCA\")\n",
    "plt.xlabel(\"1st principal component in space induced by $\\phi$\")\n",
    "plt.ylabel(\"2nd component\")\n",
    "\n",
    "plt.subplot(2, 2, 4, aspect='equal')\n",
    "plt.scatter(X_back[reds, 0], X_back[reds, 1], c=\"red\", s=20, edgecolor='k')\n",
    "plt.scatter(X_back[blues, 0], X_back[blues, 1], c=\"blue\", s=20, edgecolor='k')\n",
    "plt.title(\"Original space after inverse transform\")\n",
    "plt.xlabel(\"$x_1$\")\n",
    "plt.ylabel(\"$x_2$\")\n",
    "\n",
    "plt.subplots_adjust(0.02, 0.10, 0.98, 0.94, 0.04, 0.35)\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}