{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Scientifics Packages\n", "Gonzalo Rios (grios@dim.uchile.cl)\n", "\n", "http://scikits.appspot.com/scikits\n", "\n", "https://www.scipy.org/topical-software.html" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Sympy: Symbolic Mathematics in Python\n", "http://www.sympy.org" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:35:03.899222Z", "start_time": "2018-11-08T17:35:03.530969Z" } }, "outputs": [], "source": [ "import sympy as sy\n", "x, y = sy.symbols('x y')\n", "x, y" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:35:18.708308Z", "start_time": "2018-11-08T17:35:18.701580Z" } }, "outputs": [], "source": [ "expr = x + 2*y\n", "expr" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:35:29.373796Z", "start_time": "2018-11-08T17:35:29.367747Z" } }, "outputs": [], "source": [ "expr + 1" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:35:31.704670Z", "start_time": "2018-11-08T17:35:31.693832Z" } }, "outputs": [], "source": [ "expr + 1 - x" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:35:49.349256Z", "start_time": "2018-11-08T17:35:49.345237Z" } }, "outputs": [], "source": [ "x*expr" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:35:56.684739Z", "start_time": "2018-11-08T17:35:56.678400Z" } }, "outputs": [], "source": [ "sy.expand(x*expr)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:36:07.805939Z", "start_time": "2018-11-08T17:36:07.783355Z" } }, "outputs": [], "source": [ "sy.factor(sy.expand(x*expr))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:36:27.403448Z", "start_time": "2018-11-08T17:36:27.396924Z" } }, "outputs": [], "source": [ "sy.diff(sy.sin(x)*sy.exp(x), x)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:37:00.562619Z", "start_time": "2018-11-08T17:37:00.185132Z" } }, "outputs": [], "source": [ "sy.integrate(sy.exp(x)*sy.sin(x) + sy.exp(x)*sy.cos(x), x)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:37:23.719571Z", "start_time": "2018-11-08T17:37:23.674013Z" } }, "outputs": [], "source": [ "r = sy.limit(sy.sin(x)/x, x, 0)\n", "r" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:37:34.527393Z", "start_time": "2018-11-08T17:37:34.507056Z" } }, "outputs": [], "source": [ "sy.solve(x**2 - 2, x)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:37:57.331434Z", "start_time": "2018-11-08T17:37:57.325689Z" } }, "outputs": [], "source": [ "sy.Integral(sy.cos(x)**2, (x, 0, sy.pi))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:38:11.981472Z", "start_time": "2018-11-08T17:38:11.977311Z" } }, "outputs": [], "source": [ "sy.pprint(sy.Integral(sy.cos(x)**2, (x, 0, sy.pi)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:38:31.524171Z", "start_time": "2018-11-08T17:38:31.515698Z" } }, "outputs": [], "source": [ "print(sy.latex(sy.Integral(sy.cos(x)**2, (x, 0, sy.pi))))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:38:35.950664Z", "start_time": "2018-11-08T17:38:35.942899Z" } }, "outputs": [], "source": [ "%%latex\n", "$\\int_{0}^{\\pi} \\cos^{2}{\\left (x \\right )}\\, dx$" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:38:55.590372Z", "start_time": "2018-11-08T17:38:55.573481Z" } }, "outputs": [], "source": [ "sy.Integral(sy.cos(x)**2, (x, 0, sy.pi)).evalf()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:39:13.305995Z", "start_time": "2018-11-08T17:39:13.301251Z" } }, "outputs": [], "source": [ "expr = sy.diff(sy.sin(x)*sy.exp(x), x)\n", "expr" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:39:31.767547Z", "start_time": "2018-11-08T17:39:31.763212Z" } }, "outputs": [], "source": [ "expr.evalf(subs={x: 3.14})" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:40:03.995871Z", "start_time": "2018-11-08T17:40:03.809186Z" } }, "outputs": [], "source": [ "f = sy.lambdify(x, expr)\n", "f" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:40:08.505977Z", "start_time": "2018-11-08T17:40:08.502772Z" } }, "outputs": [], "source": [ "f(3.14)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:40:27.743742Z", "start_time": "2018-11-08T17:40:27.734482Z" } }, "outputs": [], "source": [ "fnp = sy.lambdify(x, expr, 'numpy')\n", "fnp" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:40:33.166953Z", "start_time": "2018-11-08T17:40:33.161863Z" } }, "outputs": [], "source": [ "fnp(3.14)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:40:48.833113Z", "start_time": "2018-11-08T17:40:46.040399Z" } }, "outputs": [], "source": [ "%timeit expr.evalf(subs={x: 3.14})" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:40:51.134217Z", "start_time": "2018-11-08T17:40:48.835435Z" } }, "outputs": [], "source": [ "%timeit f(3.14)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:40:54.012299Z", "start_time": "2018-11-08T17:40:51.135967Z" } }, "outputs": [], "source": [ "%timeit fnp(3.14)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:41:22.934874Z", "start_time": "2018-11-08T17:41:22.927894Z" } }, "outputs": [], "source": [ "import numpy as np\n", "x_test = np.random.randn(1000)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:41:30.990565Z", "start_time": "2018-11-08T17:41:25.618306Z" } }, "outputs": [], "source": [ "%timeit fnp(x_test)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:41:34.440993Z", "start_time": "2018-11-08T17:41:30.992122Z" } }, "outputs": [], "source": [ "%timeit np.array([f(i) for i in x_test])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# StatsModels: Statistic in Python\n", "http://www.statsmodels.org/stable/" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:43:46.583020Z", "start_time": "2018-11-08T17:43:45.348545Z" }, "scrolled": true }, "outputs": [], "source": [ "import numpy as np\n", "import statsmodels.api as sm\n", "import matplotlib.pyplot as plt\n", "import seaborn as sb\n", "%matplotlib inline\n", "\n", "sb.set(context='notebook', rc={\"figure.figsize\": (20, 8)})\n", "\n", "nobs = 10000\n", "X = np.random.random((nobs, 2))\n", "X = sm.add_constant(X)\n", "X" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:45:06.676118Z", "start_time": "2018-11-08T17:45:06.227995Z" } }, "outputs": [], "source": [ "beta = [2.2, .3, .8]\n", "e = np.random.randn(nobs)*0.2\n", "\n", "y = np.dot(X, beta) + e\n", "plt.plot(y, 'ro')\n", "plt.show()\n", "plt.plot(X[:, 0], 'r', label=\"0\")\n", "plt.plot(X[:, 1], 'bo', label=\"1\")\n", "plt.plot(X[:, 2], 'g*', label=\"2\")\n", "plt.legend()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:46:00.751773Z", "start_time": "2018-11-08T17:46:00.729248Z" } }, "outputs": [], "source": [ "results = sm.OLS(y, X).fit()\n", "results.summary()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:47:26.050918Z", "start_time": "2018-11-08T17:47:22.741061Z" } }, "outputs": [], "source": [ "import pandas as pd\n", "import statsmodels.formula.api as smf\n", "\n", "try:\n", " dat = sm.datasets.get_rdataset(\"Guerry\", \"HistData\").data\n", " dat.to_hdf('Guerry.h5', key='HistData', mode='w')\n", " print('Load dataset from R')\n", "except:\n", " dat = pd.read_hdf('Guerry.h5')\n", " print('Load dataset from HDF file')\n", "dat" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:48:33.345657Z", "start_time": "2018-11-08T17:48:33.229969Z" } }, "outputs": [], "source": [ "dat.Lottery.plot(style='ro')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:49:38.041610Z", "start_time": "2018-11-08T17:49:37.998645Z" } }, "outputs": [], "source": [ "results = smf.ols('Lottery ~ Literacy + Infanticide**2 + np.log(Pop1831)', data=dat).fit()\n", "results.summary()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:51:25.689879Z", "start_time": "2018-11-08T17:51:25.686247Z" } }, "outputs": [], "source": [ "results" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:51:28.750555Z", "start_time": "2018-11-08T17:51:28.746479Z" } }, "outputs": [], "source": [ "print('R2: ', results.rsquared)\n", "print('Parameters: ', results.params)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Statistics Tests" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:52:25.137875Z", "start_time": "2018-11-08T17:52:24.890415Z" } }, "outputs": [], "source": [ "results.resid.hist(bins=100)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:53:16.274308Z", "start_time": "2018-11-08T17:53:16.258706Z" } }, "outputs": [], "source": [ "from statsmodels.compat import lzip\n", "import statsmodels.stats.api as sms\n", "\n", "name = ['Jarque-Bera', 'Chi^2 two-tail prob.', 'Skew', 'Kurtosis']\n", "test = sms.jarque_bera(results.resid)\n", "lzip(name, test)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:53:39.016483Z", "start_time": "2018-11-08T17:53:39.011277Z" } }, "outputs": [], "source": [ "name = ['Chi^2', 'Two-tail probability']\n", "test = sms.omni_normtest(results.resid)\n", "lzip(name, test)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:53:58.154214Z", "start_time": "2018-11-08T17:53:58.141387Z" } }, "outputs": [], "source": [ "name = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value']\n", "test = sms.het_breushpagan(results.resid, results.model.exog)\n", "lzip(name, test)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:54:04.314049Z", "start_time": "2018-11-08T17:54:04.306469Z" } }, "outputs": [], "source": [ "name = ['F statistic', 'p-value']\n", "test = sms.het_goldfeldquandt(results.resid, results.model.exog)\n", "lzip(name, test)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:54:06.448338Z", "start_time": "2018-11-08T17:54:06.438435Z" } }, "outputs": [], "source": [ "name = ['t value', 'p value']\n", "test = sms.linear_harvey_collier(results)\n", "lzip(name, test)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:54:13.698572Z", "start_time": "2018-11-08T17:54:13.690270Z" } }, "outputs": [], "source": [ "np.linalg.cond(results.model.exog)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:54:29.099781Z", "start_time": "2018-11-08T17:54:28.919522Z" } }, "outputs": [], "source": [ "from statsmodels.graphics.regressionplots import plot_leverage_resid2\n", "\n", "_ = plot_leverage_resid2(results)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Non-Linear Regression" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:56:01.839623Z", "start_time": "2018-11-08T17:56:01.738823Z" } }, "outputs": [], "source": [ "nsample = 50\n", "sig = 0.5\n", "x = np.linspace(0, 20, nsample)\n", "X = np.column_stack((x, np.sin(x), (x-5)**2, np.ones(nsample)))\n", "beta = [0.5, 0.5, -0.02, 5.]\n", "\n", "y_true = np.dot(X, beta)\n", "y = y_true + sig * np.random.normal(size=nsample)\n", "plt.plot(y, 'ro')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:56:22.820250Z", "start_time": "2018-11-08T17:56:22.803991Z" } }, "outputs": [], "source": [ "res = sm.OLS(y, X).fit()\n", "res.summary()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:57:03.575614Z", "start_time": "2018-11-08T17:57:03.571363Z" } }, "outputs": [], "source": [ "print('Parameters: ', res.params)\n", "print('Standard errors: ', res.bse)\n", "print('Predicted values: ', res.predict())" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:57:53.241373Z", "start_time": "2018-11-08T17:57:53.104650Z" } }, "outputs": [], "source": [ "from statsmodels.sandbox.regression.predstd import wls_prediction_std\n", "\n", "prstd, iv_l, iv_u = wls_prediction_std(res)\n", "\n", "plt.plot(y, 'o', label=\"data\")\n", "plt.plot(y_true, 'b-', label=\"True\")\n", "plt.plot(res.fittedvalues, 'r--.', label=\"OLS\")\n", "plt.plot(iv_u, 'r--')\n", "plt.plot(iv_l, 'r--')\n", "plt.legend(loc='best')" ] }, { "cell_type": "markdown", "metadata": { "ExecuteTime": { "end_time": "2017-09-06T20:16:22.381177Z", "start_time": "2017-09-06T20:16:22.375969Z" } }, "source": [ "## Heteroscedasticity" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:58:41.425846Z", "start_time": "2018-11-08T17:58:41.419085Z" } }, "outputs": [], "source": [ "nsample = 50\n", "x = np.linspace(0, 20, nsample)\n", "X = np.column_stack((x, (x - 5)**2))\n", "X = sm.add_constant(X)\n", "beta = [5., 0.5, -0.01]\n", "sig = 0.5\n", "w = np.ones(nsample)\n", "w[nsample * 6//10:] = 3\n", "y_true = np.dot(X, beta)\n", "e = np.random.normal(size=nsample)\n", "y = y_true + sig * w * e \n", "X = X[:,[0,1]]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T17:59:01.554727Z", "start_time": "2018-11-08T17:59:01.421397Z" } }, "outputs": [], "source": [ "plt.plot(y, '*')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:00:39.426814Z", "start_time": "2018-11-08T18:00:39.411207Z" } }, "outputs": [], "source": [ "mod_wls = sm.WLS(y, X, weights=1./w)\n", "res_wls = mod_wls.fit()\n", "res_wls.summary()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:00:54.027411Z", "start_time": "2018-11-08T18:00:54.013512Z" } }, "outputs": [], "source": [ "res_ols = sm.OLS(y, X).fit()\n", "res_ols.summary()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:01:11.791916Z", "start_time": "2018-11-08T18:01:11.780312Z" } }, "outputs": [], "source": [ "print('OLS', res_ols.params)\n", "print('WLS', res_wls.params)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:01:21.602371Z", "start_time": "2018-11-08T18:01:21.432660Z" } }, "outputs": [], "source": [ "prstd_ols, iv_l_ols, iv_u_ols = wls_prediction_std(res_ols)\n", "prstd, iv_l, iv_u = wls_prediction_std(res_wls)\n", "\n", "fig, ax = plt.subplots(figsize=(8,6))\n", "ax.plot(x, y, 'o', label=\"Data\")\n", "ax.plot(x, y_true, 'b-', label=\"True\")\n", "# OLS\n", "ax.plot(x, res_ols.fittedvalues, 'r--')\n", "ax.plot(x, iv_u_ols, 'r--', label=\"OLS\")\n", "ax.plot(x, iv_l_ols, 'r--')\n", "# WLS\n", "ax.plot(x, res_wls.fittedvalues, 'g--.')\n", "ax.plot(x, iv_u, 'g--', label=\"WLS\")\n", "ax.plot(x, iv_l, 'g--')\n", "ax.legend(loc=\"best\");" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Autoregressive Moving Average (ARMA)\n", "http://www.statsmodels.org/dev/examples/notebooks/generated/tsa_arma_0.html" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:02:04.846885Z", "start_time": "2018-11-08T18:02:04.661132Z" } }, "outputs": [], "source": [ "dta = sm.datasets.sunspots.load_pandas().data\n", "dta.index = pd.Index(sm.tsa.datetools.dates_from_range('1700', '2008'))\n", "del dta[\"YEAR\"]\n", "dta.plot()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:02:54.913119Z", "start_time": "2018-11-08T18:02:54.682118Z" } }, "outputs": [], "source": [ " _ = sm.graphics.tsa.plot_acf(dta.values.squeeze(), lags=40)\n", "_ = sm.graphics.tsa.plot_pacf(dta, lags=40)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:05:12.279648Z", "start_time": "2018-11-08T18:05:12.194182Z" } }, "outputs": [], "source": [ "arma_mod20 = sm.tsa.ARMA(dta, (2,0)).fit(disp=False)\n", "print(arma_mod20.params)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:07:25.650857Z", "start_time": "2018-11-08T18:07:25.527014Z" } }, "outputs": [], "source": [ "arma_mod30 = sm.tsa.ARMA(dta, (3,0)).fit(disp=False)\n", "print(arma_mod30.params)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:07:26.046296Z", "start_time": "2018-11-08T18:07:26.040966Z" } }, "outputs": [], "source": [ "print(arma_mod20.aic, arma_mod20.bic, arma_mod20.hqic)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:07:26.711582Z", "start_time": "2018-11-08T18:07:26.707818Z" } }, "outputs": [], "source": [ "print(arma_mod30.aic, arma_mod30.bic, arma_mod30.hqic)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:07:50.962394Z", "start_time": "2018-11-08T18:07:50.949200Z" } }, "outputs": [], "source": [ "sm.stats.durbin_watson(arma_mod30.resid.values)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:07:55.286855Z", "start_time": "2018-11-08T18:07:55.283054Z" } }, "outputs": [], "source": [ "arma_mod30.model.endog_names" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:07:56.594253Z", "start_time": "2018-11-08T18:07:56.590935Z" } }, "outputs": [], "source": [ "arma_mod30.model.exog_names" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:08:09.960714Z", "start_time": "2018-11-08T18:08:09.729586Z" } }, "outputs": [], "source": [ "_ = arma_mod20.plot_predict()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:07:58.586793Z", "start_time": "2018-11-08T18:07:58.407530Z" } }, "outputs": [], "source": [ "_ = arma_mod30.plot_predict()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:08:23.032832Z", "start_time": "2018-11-08T18:08:22.866801Z" } }, "outputs": [], "source": [ "arma_mod20.resid.plot()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:08:24.114883Z", "start_time": "2018-11-08T18:08:23.960027Z" } }, "outputs": [], "source": [ "arma_mod30.resid.plot()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:08:39.386774Z", "start_time": "2018-11-08T18:08:39.265069Z" } }, "outputs": [], "source": [ "plt.plot(arma_mod30.forecast(100)[0])\n", "plt.plot(arma_mod30.forecast(100)[2])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Seasonal Decompose" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:09:11.738439Z", "start_time": "2018-11-08T18:09:11.474671Z" }, "scrolled": true }, "outputs": [], "source": [ "co2 = pd.DataFrame(sm.datasets.co2.data.load().data)\n", "co2.index = pd.to_datetime(co2['date'].map(lambda x: x.decode(\"utf-8\") ))\n", "del co2['date']\n", "co2.plot()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:10:27.654384Z", "start_time": "2018-11-08T18:10:27.436381Z" } }, "outputs": [], "source": [ "co2.fillna(method='ffill', inplace=True)\n", "co2.plot()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:11:13.683857Z", "start_time": "2018-11-08T18:11:13.012664Z" } }, "outputs": [], "source": [ "decomp = sm.tsa.seasonal_decompose(co2, model='additive')\n", "additive = pd.DataFrame(index=co2.index)\n", "additive['Real'] = decomp.observed\n", "additive['Trend'] = decomp.trend\n", "additive['Seasonal'] = decomp.seasonal\n", "additive['Resid'] = decomp.resid\n", "\n", "_ = additive.plot(subplots=True, layout=(2, 2), figsize=(20, 10))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:11:14.493299Z", "start_time": "2018-11-08T18:11:13.685594Z" } }, "outputs": [], "source": [ "decomp = sm.tsa.seasonal_decompose(co2, model='multiplicative')\n", "multiplicative = pd.DataFrame(index=co2.index)\n", "multiplicative['Real'] = decomp.observed\n", "multiplicative['Trend'] = decomp.trend\n", "multiplicative['Seasonal'] = decomp.seasonal\n", "multiplicative['Resid'] = decomp.resid\n", "\n", "_ = multiplicative.plot(subplots=True, layout=(2, 2), figsize=(20, 10))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:11:58.488744Z", "start_time": "2018-11-08T18:11:58.217275Z" } }, "outputs": [], "source": [ "models = pd.DataFrame(index=co2.index)\n", "models['Real'] = decomp.observed\n", "models['Additive'] = additive['Trend'] + additive['Seasonal']\n", "models['Multiplicative'] = multiplicative['Trend']*multiplicative['Seasonal']\n", "models.plot()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:12:16.534263Z", "start_time": "2018-11-08T18:12:16.529540Z" } }, "outputs": [], "source": [ "np.mean((models['Real'] - models['Additive'])**2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:12:16.978757Z", "start_time": "2018-11-08T18:12:16.973133Z" } }, "outputs": [], "source": [ "np.mean((models['Real'] - models['Multiplicative'])**2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# PyMC3: Bayesian Statistical Modeling and Probabilistic Machine Learning " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:20:41.168892Z", "start_time": "2018-11-08T18:20:40.960336Z" } }, "outputs": [], "source": [ "%matplotlib inline\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import seaborn as sb\n", "import os\n", "os.environ['MKL_THREADING_LAYER'] = 'GNU'\n", "import pymc3 as pm\n", "\n", "data = 1.43 + 1.23*np.random.randn(1000)\n", "\n", "with pm.Model() as model:\n", " mu = pm.Normal('mu', mu=0, sd=1)\n", " sd = pm.Exponential('sd', lam=1)\n", " obs = pm.Normal('obs', mu=mu, sd=sd, observed=data)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:20:41.196052Z", "start_time": "2018-11-08T18:20:41.192727Z" } }, "outputs": [], "source": [ "model" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:20:41.416818Z", "start_time": "2018-11-08T18:20:41.413312Z" } }, "outputs": [], "source": [ "model.basic_RVs" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:20:41.639120Z", "start_time": "2018-11-08T18:20:41.636009Z" } }, "outputs": [], "source": [ "model.free_RVs" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:20:41.823125Z", "start_time": "2018-11-08T18:20:41.820014Z" } }, "outputs": [], "source": [ "model.observed_RVs" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:20:42.051862Z", "start_time": "2018-11-08T18:20:42.048257Z" } }, "outputs": [], "source": [ "model.deterministics" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:20:42.393820Z", "start_time": "2018-11-08T18:20:42.389686Z" } }, "outputs": [], "source": [ "p = model.test_point.copy()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:20:42.778045Z", "start_time": "2018-11-08T18:20:42.774292Z" } }, "outputs": [], "source": [ "p['mu'] = 1.45" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:20:43.913569Z", "start_time": "2018-11-08T18:20:43.307158Z" } }, "outputs": [], "source": [ "with model:\n", " p = pm.find_MAP()\n", "print(p)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:20:44.632867Z", "start_time": "2018-11-08T18:20:43.914966Z" } }, "outputs": [], "source": [ "with model:\n", " p = pm.find_MAP(start=p)\n", "print(p)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:20:45.252284Z", "start_time": "2018-11-08T18:20:44.634669Z" } }, "outputs": [], "source": [ "import scipy as sp\n", "\n", "\n", "with model:\n", " p = pm.find_MAP(fmin=sp.optimize.fmin_powell)\n", "print(p)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:21:29.111581Z", "start_time": "2018-11-08T18:21:26.975434Z" } }, "outputs": [], "source": [ "with model:\n", " trace = pm.sample(1000)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:22:00.827496Z", "start_time": "2018-11-08T18:22:00.823368Z" } }, "outputs": [], "source": [ "trace" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:22:07.600108Z", "start_time": "2018-11-08T18:22:07.594363Z" } }, "outputs": [], "source": [ "trace.varnames" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:22:09.224776Z", "start_time": "2018-11-08T18:22:09.218686Z" } }, "outputs": [], "source": [ "trace['mu']" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:22:17.543800Z", "start_time": "2018-11-08T18:22:17.129996Z" } }, "outputs": [], "source": [ "pm.traceplot(trace)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:23:28.267025Z", "start_time": "2018-11-08T18:23:27.807306Z" } }, "outputs": [], "source": [ "_ = pm.traceplot(trace[500:])\n", "pm.summary(trace[500:])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:25:24.255912Z", "start_time": "2018-11-08T18:25:22.959167Z" } }, "outputs": [], "source": [ "with model:\n", " trace_m = pm.sample(1000, step=pm.Metropolis())" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:25:29.828718Z", "start_time": "2018-11-08T18:25:29.382967Z" } }, "outputs": [], "source": [ "_ = pm.traceplot(trace_m[500:])\n", "pm.summary(trace_m[500:])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:25:43.563492Z", "start_time": "2018-11-08T18:25:41.265997Z" } }, "outputs": [], "source": [ "with model:\n", " trace_s = pm.sample(1000, step=pm.Slice(), njobs=5)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:25:51.960296Z", "start_time": "2018-11-08T18:25:51.449060Z" } }, "outputs": [], "source": [ "_ = pm.traceplot(trace_s[500:])\n", "pm.summary(trace_s[500:])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:26:51.323440Z", "start_time": "2018-11-08T18:26:51.091582Z" } }, "outputs": [], "source": [ "with model:\n", " post_pred = pm.sample_ppc(trace_s[500:], samples=500, size=len(data))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:26:54.356677Z", "start_time": "2018-11-08T18:26:54.195708Z" }, "scrolled": false }, "outputs": [], "source": [ "plt.figure()\n", "ax = sb.distplot(post_pred['obs'].mean(axis=1), label='Posterior predictive means')\n", "ax.axvline(data.mean(), color='r', ls='--', label='True mean')\n", "ax.legend()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Scikit Learn: Machine Learning in Python\n", "## Clustering" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:30:51.579127Z", "start_time": "2018-11-08T18:30:51.560908Z" }, "scrolled": true }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import sklearn as sk\n", "\n", "from sklearn import cluster, mixture\n", "n_samples = 500\n", "\n", "C = np.array([[0., -0.1, 3.0], [1.7, .4, 3.1]])\n", "X = np.concatenate([np.dot(np.random.randn(n_samples, 2), C),\n", " .7 * np.random.randn(n_samples, 3) + np.array([-6, 3, 2])])\n", "X = pd.DataFrame(X, columns=['x1', 'x2', 'x3'])\n", "X" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:31:48.219012Z", "start_time": "2018-11-08T18:31:48.192357Z" } }, "outputs": [], "source": [ "kmeans = cluster.KMeans(n_clusters=2).fit(X)\n", "kmeans" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:31:59.371293Z", "start_time": "2018-11-08T18:31:59.364590Z" }, "scrolled": true }, "outputs": [], "source": [ "kmeans.predict(X)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:32:12.928767Z", "start_time": "2018-11-08T18:32:12.924334Z" } }, "outputs": [], "source": [ "kmeans.cluster_centers_" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:32:47.672993Z", "start_time": "2018-11-08T18:32:47.665185Z" } }, "outputs": [], "source": [ "%matplotlib inline\n", "import itertools\n", "import matplotlib.pyplot as plt\n", "\n", "\n", "def plot_clusters(X, cluster, title):\n", " for ids in [[0, 1], [0, 2], [1, 2]]:\n", " plt.figure(figsize=(15, 3))\n", " plot = plt.subplot(111)\n", " color_iter = itertools.cycle(['navy', 'c', 'cornflowerblue', 'gold', 'darkorange', 'r', 'g', 'k', 'm'])\n", " for i, color in zip(range(min(cluster), max(cluster)+1), color_iter):\n", " if not np.any(cluster == i):\n", " continue\n", " plt.scatter(X[cluster == i, ids[0]], X[cluster == i, ids[1]], .8, color=color)\n", " plt.title(title + str(ids))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:32:52.701958Z", "start_time": "2018-11-08T18:32:52.388694Z" } }, "outputs": [], "source": [ "plot_clusters(X.values, kmeans.predict(X), 'KMeans')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:33:25.557712Z", "start_time": "2018-11-08T18:33:25.123186Z" } }, "outputs": [], "source": [ "kmeans = cluster.KMeans(n_clusters=10).fit(X)\n", "\n", "plot_clusters(X.values, kmeans.predict(X), 'KMeans')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:34:48.761611Z", "start_time": "2018-11-08T18:34:48.359539Z" } }, "outputs": [], "source": [ "agglomerative = cluster.AgglomerativeClustering(n_clusters=10, linkage='ward').fit(X)\n", "\n", "plot_clusters(X.values, agglomerative.fit_predict(X), 'AgglomerativeClustering')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:35:01.963545Z", "start_time": "2018-11-08T18:35:01.576697Z" } }, "outputs": [], "source": [ "birch = cluster.Birch(branching_factor=50, n_clusters=10, threshold=0.5, compute_labels=True).fit(X)\n", "plot_clusters(X.values, birch.predict(X), 'Birch')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:35:17.097851Z", "start_time": "2018-11-08T18:35:10.890679Z" } }, "outputs": [], "source": [ "affinity = cluster.AffinityPropagation(preference=-100).fit(X)\n", "\n", "plot_clusters(X.values, affinity.predict(X), 'AffinityPropagation')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:36:07.575077Z", "start_time": "2018-11-08T18:36:07.209363Z" } }, "outputs": [], "source": [ "dbscan = cluster.DBSCAN(eps=0.5, min_samples=10).fit(X)\n", "plot_clusters(X.values, dbscan.fit_predict(X), 'DBSCAN')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:36:23.715257Z", "start_time": "2018-11-08T18:36:22.284064Z" } }, "outputs": [], "source": [ "bandwidth = cluster.estimate_bandwidth(X.values, quantile=0.01, n_samples=500)\n", "meanshift = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True).fit(X)\n", "\n", "plot_clusters(X.values, meanshift.predict(X), 'MeanShift')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:36:34.799567Z", "start_time": "2018-11-08T18:36:34.387441Z" } }, "outputs": [], "source": [ "gmm = mixture.GaussianMixture(n_components=10, covariance_type='full').fit(X)\n", "plot_clusters(X.values, gmm.predict(X), 'Gaussian Mixture')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:36:55.939105Z", "start_time": "2018-11-08T18:36:55.447435Z" } }, "outputs": [], "source": [ "dpgmm = mixture.BayesianGaussianMixture(n_components=10, covariance_type='full').fit(X)\n", "plot_clusters(X.values, dpgmm.predict(X), 'Bayesian Gaussian Mixture with a Dirichlet process prior')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Classification" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:38:06.533971Z", "start_time": "2018-11-08T18:38:05.877782Z" } }, "outputs": [], "source": [ "import numpy as np\n", "import matplotlib.pyplot as plt\n", "from matplotlib.colors import ListedColormap\n", "from sklearn import datasets\n", "from sklearn.neighbors import NearestCentroid\n", "\n", "n_neighbors = 15\n", "\n", "# import some data to play with\n", "iris = datasets.load_iris()\n", "# we only take the first two features. We could avoid this ugly\n", "# slicing by using a two-dim dataset\n", "X = iris.data[:, :2]\n", "y = iris.target\n", "\n", "h = .02 # step size in the mesh\n", "\n", "# Create color maps\n", "cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])\n", "cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])\n", "\n", "for shrinkage in [None, 1.0, .2, 0.1]:\n", " # we create an instance of Neighbours Classifier and fit the data.\n", " clf = NearestCentroid(shrink_threshold=shrinkage)\n", " clf.fit(X, y)\n", " y_pred = clf.predict(X)\n", "\n", " # Plot the decision boundary. For that, we will assign a color to each\n", " # point in the mesh [x_min, x_max]x[y_min, y_max].\n", " x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1\n", " y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1\n", " xx, yy = np.meshgrid(np.arange(x_min, x_max, h),\n", " np.arange(y_min, y_max, h))\n", " Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])\n", "\n", " # Put the result into a color plot\n", " Z = Z.reshape(xx.shape)\n", " plt.figure(figsize=(10, 5))\n", " plt.pcolormesh(xx, yy, Z, cmap=cmap_light)\n", "\n", " # Plot also the training points\n", " plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold,\n", " edgecolor='b', s=20)\n", " plt.title(\"3-Class classification (shrink_threshold={}, precision={})\".format(shrinkage, np.mean(y == y_pred)))\n", " plt.axis('tight')\n", "\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:40:59.645937Z", "start_time": "2018-11-08T18:40:59.042862Z" } }, "outputs": [], "source": [ "import numpy as np\n", "import matplotlib.pyplot as plt\n", "from matplotlib.colors import ListedColormap\n", "from sklearn import neighbors, datasets\n", "\n", "n_neighbors = 15\n", "\n", "# import some data to play with\n", "iris = datasets.load_iris()\n", "\n", "# we only take the first two features. We could avoid this ugly\n", "# slicing by using a two-dim dataset\n", "X = iris.data[:, :2]\n", "y = iris.target\n", "\n", "h = .02 # step size in the mesh\n", "\n", "# Create color maps\n", "cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])\n", "cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])\n", "\n", "for weights in ['uniform', 'distance']:\n", " # we create an instance of Neighbours Classifier and fit the data.\n", " clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)\n", " clf.fit(X, y)\n", "\n", " # Plot the decision boundary. For that, we will assign a color to each\n", " # point in the mesh [x_min, x_max]x[y_min, y_max].\n", " x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1\n", " y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1\n", " xx, yy = np.meshgrid(np.arange(x_min, x_max, h),\n", " np.arange(y_min, y_max, h))\n", " Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])\n", "\n", " # Put the result into a color plot\n", " Z = Z.reshape(xx.shape)\n", " plt.figure(figsize=(10, 5))\n", " plt.pcolormesh(xx, yy, Z, cmap=cmap_light)\n", "\n", " # Plot also the training points\n", " plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold,\n", " edgecolor='k', s=20)\n", " plt.xlim(xx.min(), xx.max())\n", " plt.ylim(yy.min(), yy.max())\n", " plt.title(\"3-Class classification (k = %i, weights = '%s')\" % (n_neighbors, weights))\n", "\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:44:36.647838Z", "start_time": "2018-11-08T18:44:29.575452Z" } }, "outputs": [], "source": [ "import numpy as np\n", "import matplotlib.pyplot as plt\n", "from matplotlib.colors import ListedColormap\n", "\n", "from sklearn import clone\n", "from sklearn.datasets import load_iris\n", "from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier,\n", " AdaBoostClassifier)\n", "from sklearn.tree import DecisionTreeClassifier\n", "\n", "# Parameters\n", "n_classes = 3\n", "n_estimators = 30\n", "cmap = plt.cm.RdYlBu\n", "plot_step = 0.02 # fine step width for decision surface contours\n", "plot_step_coarser = 0.5 # step widths for coarse classifier guesses\n", "RANDOM_SEED = 13 # fix the seed on each iteration\n", "\n", "# Load data\n", "iris = load_iris()\n", "\n", "plot_idx = 1\n", "\n", "models = [DecisionTreeClassifier(max_depth=None),\n", " RandomForestClassifier(n_estimators=n_estimators),\n", " ExtraTreesClassifier(n_estimators=n_estimators),\n", " AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),\n", " n_estimators=n_estimators)]\n", "plt.figure(figsize=(20, 15))\n", "for pair in ([0, 1], [0, 2], [2, 3]):\n", " for model in models:\n", " # We only take the two corresponding features\n", " X = iris.data[:, pair]\n", " y = iris.target\n", "\n", " # Shuffle\n", " idx = np.arange(X.shape[0])\n", " np.random.seed(RANDOM_SEED)\n", " np.random.shuffle(idx)\n", " X = X[idx]\n", " y = y[idx]\n", "\n", " # Standardize\n", " mean = X.mean(axis=0)\n", " std = X.std(axis=0)\n", " X = (X - mean) / std\n", "\n", " # Train\n", " clf = clone(model)\n", " clf = model.fit(X, y)\n", "\n", " scores = clf.score(X, y)\n", " # Create a title for each column and the console by using str() and\n", " # slicing away useless parts of the string\n", " model_title = str(type(model)).split(\n", " \".\")[-1][:-2][:-len(\"Classifier\")]\n", "\n", " model_details = model_title\n", " if hasattr(model, \"estimators_\"):\n", " model_details += \" with {} estimators\".format(\n", " len(model.estimators_))\n", " print(model_details + \" with features\", pair,\n", " \"has a score of\", scores)\n", "\n", " plt.subplot(3, 4, plot_idx)\n", " if plot_idx <= len(models):\n", " # Add a title at the top of each column\n", " plt.title(model_title)\n", "\n", " # Now plot the decision boundary using a fine mesh as input to a\n", " # filled contour plot\n", " x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1\n", " y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1\n", " xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),\n", " np.arange(y_min, y_max, plot_step))\n", "\n", " # Plot either a single DecisionTreeClassifier or alpha blend the\n", " # decision surfaces of the ensemble of classifiers\n", " if isinstance(model, DecisionTreeClassifier):\n", " Z = model.predict(np.c_[xx.ravel(), yy.ravel()])\n", " Z = Z.reshape(xx.shape)\n", " cs = plt.contourf(xx, yy, Z, cmap=cmap)\n", " else:\n", " # Choose alpha blend level with respect to the number\n", " # of estimators\n", " # that are in use (noting that AdaBoost can use fewer estimators\n", " # than its maximum if it achieves a good enough fit early on)\n", " estimator_alpha = 1.0 / len(model.estimators_)\n", " for tree in model.estimators_:\n", " Z = tree.predict(np.c_[xx.ravel(), yy.ravel()])\n", " Z = Z.reshape(xx.shape)\n", " cs = plt.contourf(xx, yy, Z, alpha=estimator_alpha, cmap=cmap)\n", "\n", " # Build a coarser grid to plot a set of ensemble classifications\n", " # to show how these are different to what we see in the decision\n", " # surfaces. These points are regularly space and do not have a\n", " # black outline\n", " xx_coarser, yy_coarser = np.meshgrid(\n", " np.arange(x_min, x_max, plot_step_coarser),\n", " np.arange(y_min, y_max, plot_step_coarser))\n", " Z_points_coarser = model.predict(np.c_[xx_coarser.ravel(),\n", " yy_coarser.ravel()]\n", " ).reshape(xx_coarser.shape)\n", " cs_points = plt.scatter(xx_coarser, yy_coarser, s=15,\n", " c=Z_points_coarser, cmap=cmap,\n", " edgecolors=\"none\")\n", "\n", " # Plot the training points, these are clustered together and have a\n", " # black outline\n", " plt.scatter(X[:, 0], X[:, 1], c=y,\n", " cmap=ListedColormap(['r', 'y', 'b']),\n", " edgecolor='k', s=20)\n", " plot_idx += 1 # move on to the next plot in sequence\n", "\n", "plt.suptitle(\"Classifiers on feature subsets of the Iris dataset\")\n", "plt.axis(\"tight\")\n", "\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Dimensionality Reduction" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:48:15.700893Z", "start_time": "2018-11-08T18:48:15.441447Z" } }, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "\n", "from sklearn import datasets\n", "from sklearn.decomposition import PCA\n", "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n", "\n", "iris = datasets.load_iris()\n", "\n", "X = iris.data\n", "y = iris.target\n", "target_names = iris.target_names\n", "\n", "pca = PCA(n_components=2)\n", "X_r = pca.fit(X).transform(X)\n", "\n", "lda = LinearDiscriminantAnalysis(n_components=2)\n", "X_r2 = lda.fit(X, y).transform(X)\n", "\n", "# Percentage of variance explained for each components\n", "print('explained variance ratio (first two components): %s' % str(pca.explained_variance_ratio_))\n", "\n", "plt.figure()\n", "colors = ['navy', 'turquoise', 'darkorange']\n", "lw = 2\n", "\n", "for color, i, target_name in zip(colors, [0, 1, 2], target_names):\n", " plt.scatter(X_r[y == i, 0], X_r[y == i, 1], color=color, alpha=.8, lw=lw,\n", " label=target_name)\n", "plt.legend(loc='best', shadow=False, scatterpoints=1)\n", "plt.title('PCA of IRIS dataset')\n", "\n", "plt.figure()\n", "for color, i, target_name in zip(colors, [0, 1, 2], target_names):\n", " plt.scatter(X_r2[y == i, 0], X_r2[y == i, 1], alpha=.8, color=color,\n", " label=target_name)\n", "plt.legend(loc='best', shadow=False, scatterpoints=1)\n", "plt.title('LDA of IRIS dataset')\n", "\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:50:10.108399Z", "start_time": "2018-11-08T18:50:09.762862Z" } }, "outputs": [], "source": [ "import numpy as np\n", "import matplotlib.pyplot as plt\n", "\n", "from sklearn.decomposition import PCA, KernelPCA\n", "from sklearn.datasets import make_circles\n", "\n", "np.random.seed(0)\n", "\n", "X, y = make_circles(n_samples=400, factor=.3, noise=.05)\n", "\n", "plt.figure(figsize=(12, 12))\n", "plt.subplot(2, 2, 1, aspect='equal')\n", "plt.title(\"Original space\")\n", "reds = y == 0\n", "blues = y == 1\n", "plt.scatter(X[reds, 0], X[reds, 1], c=\"red\", s=20, edgecolor='k')\n", "plt.scatter(X[blues, 0], X[blues, 1], c=\"blue\", s=20, edgecolor='k')\n", "plt.xlabel(\"$x_1$\")\n", "plt.ylabel(\"$x_2$\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T18:52:36.591746Z", "start_time": "2018-11-08T18:52:36.148517Z" } }, "outputs": [], "source": [ "kpca = KernelPCA(kernel=\"rbf\", fit_inverse_transform=True, gamma=10)\n", "X_kpca = kpca.fit_transform(X)\n", "X_back = kpca.inverse_transform(X_kpca)\n", "pca = PCA()\n", "X_pca = pca.fit_transform(X)\n", "\n", "# Plot results\n", "\n", "plt.figure(figsize=(12, 12))\n", "plt.subplot(2, 2, 1, aspect='equal')\n", "plt.title(\"Original space\")\n", "reds = y == 0\n", "blues = y == 1\n", "\n", "plt.scatter(X[reds, 0], X[reds, 1], c=\"red\", s=20, edgecolor='k')\n", "plt.scatter(X[blues, 0], X[blues, 1], c=\"blue\", s=20, edgecolor='k')\n", "plt.xlabel(\"$x_1$\")\n", "plt.ylabel(\"$x_2$\")\n", "\n", "X1, X2 = np.meshgrid(np.linspace(-1.5, 1.5, 50), np.linspace(-1.5, 1.5, 50))\n", "X_grid = np.array([np.ravel(X1), np.ravel(X2)]).T\n", "# projection on the first principal component (in the phi space)\n", "Z_grid = kpca.transform(X_grid)[:, 0].reshape(X1.shape)\n", "plt.contour(X1, X2, Z_grid, colors='grey', linewidths=1, origin='lower')\n", "\n", "plt.subplot(2, 2, 2, aspect='equal')\n", "plt.scatter(X_pca[reds, 0], X_pca[reds, 1], c=\"red\", s=20, edgecolor='k')\n", "plt.scatter(X_pca[blues, 0], X_pca[blues, 1], c=\"blue\", s=20, edgecolor='k')\n", "plt.title(\"Projection by PCA\")\n", "plt.xlabel(\"1st principal component\")\n", "plt.ylabel(\"2nd component\")\n", "\n", "plt.subplot(2, 2, 3, aspect='equal')\n", "plt.scatter(X_kpca[reds, 0], X_kpca[reds, 1], c=\"red\", s=20, edgecolor='k')\n", "plt.scatter(X_kpca[blues, 0], X_kpca[blues, 1], c=\"blue\", s=20, edgecolor='k')\n", "plt.title(\"Projection by KPCA\")\n", "plt.xlabel(\"1st principal component in space induced by $\\phi$\")\n", "plt.ylabel(\"2nd component\")\n", "\n", "plt.subplot(2, 2, 4, aspect='equal')\n", "plt.scatter(X_back[reds, 0], X_back[reds, 1], c=\"red\", s=20, edgecolor='k')\n", "plt.scatter(X_back[blues, 0], X_back[blues, 1], c=\"blue\", s=20, edgecolor='k')\n", "plt.title(\"Original space after inverse transform\")\n", "plt.xlabel(\"$x_1$\")\n", "plt.ylabel(\"$x_2$\")\n", "\n", "plt.subplots_adjust(0.02, 0.10, 0.98, 0.94, 0.04, 0.35)\n", "\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.0" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": true } }, "nbformat": 4, "nbformat_minor": 1 }