{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Pandas: A Data Analysis Library\n", "http://pandas.pydata.org/pandas-docs/stable/10min.html#min\n", "\n", "http://pandas.pydata.org/pandas-docs/stable/tutorials.html" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:21:33.052925Z", "start_time": "2018-10-25T19:21:32.250149Z" } }, "outputs": [], "source": [ "import pandas as pd\n", "from IPython.display import display # para que al imprimir tablas con print() salgan bonitas\n", "import numpy as np\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Se crea una serie pasando una lista de valores. Por default pandas crea un índice entero:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:22:42.792462Z", "start_time": "2018-10-23T19:22:42.782185Z" } }, "outputs": [], "source": [ "s = pd.Series([1,3,5,np.nan, 6,8], dtype=np.float32)\n", "s" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Para crear un 'DataFrame' se entrega un 'numpy array', con un índice de 'datatimes' y columnas con etiquetas:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:24:55.444734Z", "start_time": "2018-10-23T19:24:55.433188Z" } }, "outputs": [], "source": [ "dates = pd.date_range('20170101', periods = 6, freq='W-MON')\n", "display(dates)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:25:40.462401Z", "start_time": "2018-10-23T19:25:40.444917Z" } }, "outputs": [], "source": [ "df = pd.DataFrame(np.random.randn(6,4), index = dates, columns = list('ABCD'))\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Crear un 'dataframe' pasando un diccionario de objetos que pueden ser convertidos a una serie" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:26:07.132041Z", "start_time": "2018-10-23T19:26:07.110967Z" }, "scrolled": true }, "outputs": [], "source": [ "df2 = pd.DataFrame({'A': 1.,\n", " 'B': pd.Timestamp('20170713'),\n", " 'C': pd.Series(1,index=list(range(4)),dtype='float32'),\n", " 'D': np.array([3]*4, dtype='int32'),\n", " 'E': pd.Categorical([\"test\",\"train\",\"test\",\"train\"]),\n", " 'F': 'foo'})\n", "\n", "df2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Con 'dtype' específicos" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Viendo los datos\n", "\n", "Ver la parte superior o inferior de una tabla" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:26:58.110241Z", "start_time": "2018-10-23T19:26:58.094461Z" } }, "outputs": [], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:27:05.477763Z", "start_time": "2018-10-23T19:27:05.457995Z" } }, "outputs": [], "source": [ "df.tail(3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Mostrar el índice, las columnas, los datos e información adicional" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:27:37.702606Z", "start_time": "2018-10-23T19:27:37.677970Z" } }, "outputs": [], "source": [ "display(df.index)\n", "display(df.columns)\n", "display(df.values)\n", "display(df.shape)\n", "display(df.count())\n", "df.info()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Viendo un resumen de los datos:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:29:23.401332Z", "start_time": "2018-10-23T19:29:23.373649Z" } }, "outputs": [], "source": [ "df.describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Trasponiendo los datos" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:29:32.921852Z", "start_time": "2018-10-23T19:29:32.898273Z" } }, "outputs": [], "source": [ "df.T" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:30:12.881916Z", "start_time": "2018-10-23T19:30:12.849870Z" } }, "outputs": [], "source": [ "df.describe().T" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:30:38.557938Z", "start_time": "2018-10-23T19:30:38.522257Z" } }, "outputs": [], "source": [ "df.T.describe().T" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Ordenando por ejes:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:31:11.692802Z", "start_time": "2018-10-23T19:31:11.674586Z" } }, "outputs": [], "source": [ "df.sort_index(axis=0, ascending=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:31:39.158372Z", "start_time": "2018-10-23T19:31:39.141153Z" } }, "outputs": [], "source": [ "df.sort_values(by='C', ascending=False).sort_index(axis=1, ascending=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:35:16.732957Z", "start_time": "2018-10-23T19:35:16.718124Z" } }, "outputs": [], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Selección\n", "\n", "Obtener una columa sola" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:36:54.269183Z", "start_time": "2018-10-23T19:36:54.262310Z" }, "scrolled": true }, "outputs": [], "source": [ "df['D']" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:39:18.899666Z", "start_time": "2018-10-23T19:39:18.887686Z" }, "scrolled": true }, "outputs": [], "source": [ "df.A" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Usando [ ] podemos sacar tajadas (filas)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:37:37.283674Z", "start_time": "2018-10-23T19:37:37.269345Z" } }, "outputs": [], "source": [ "df[0:3]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:37:53.796546Z", "start_time": "2018-10-23T19:37:53.782236Z" } }, "outputs": [], "source": [ "df['20170103':'20170205']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Seleccionado por etiquetas:\n", "\n", "Para obtener una sección usando etiquetas, y una selección multi-eje:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:39:38.317353Z", "start_time": "2018-10-23T19:39:38.307784Z" } }, "outputs": [], "source": [ "dates[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:40:02.209323Z", "start_time": "2018-10-23T19:40:02.198900Z" } }, "outputs": [], "source": [ "df.loc[dates[0]]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:40:31.899471Z", "start_time": "2018-10-23T19:40:31.886415Z" } }, "outputs": [], "source": [ "df.loc[:,['A','B']]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Mostrando un corte por etiqueta, especificando los puntos de término:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:45:48.690230Z", "start_time": "2018-10-23T19:45:48.674380Z" } }, "outputs": [], "source": [ "df.loc['20170102':'20170204',['A','B']]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Reduciendo la dimensión del objeto entregado:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:42:43.798868Z", "start_time": "2018-10-23T19:42:43.785406Z" } }, "outputs": [], "source": [ "df.loc['20170102',['A','B']]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Para obtener un valor escalar" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:43:09.818789Z", "start_time": "2018-10-23T19:42:56.291800Z" } }, "outputs": [], "source": [ "%timeit df.loc[dates[0],'A']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Para obtener rápido acceso a un escalar " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:43:19.939920Z", "start_time": "2018-10-23T19:43:09.820892Z" } }, "outputs": [], "source": [ "%timeit df.at[dates[0],'A']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Selección por posición\n", "\n", "Se selecciona la posición de los indices entregados" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:43:51.803284Z", "start_time": "2018-10-23T19:43:51.788943Z" } }, "outputs": [], "source": [ "df" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:44:07.606333Z", "start_time": "2018-10-23T19:44:07.597272Z" } }, "outputs": [], "source": [ "df.iloc[3]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Cortes usando enteros actúa igual que numpy" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:44:09.580121Z", "start_time": "2018-10-23T19:44:09.567799Z" } }, "outputs": [], "source": [ "df.iloc[3:5,0:2]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Una lista de posiciones enteras:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:44:12.378112Z", "start_time": "2018-10-23T19:44:12.361018Z" } }, "outputs": [], "source": [ "df.iloc[[1,2,4],[0,2]]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Cortes explícitos de filas y columnas:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:44:24.708223Z", "start_time": "2018-10-23T19:44:24.690871Z" } }, "outputs": [], "source": [ "df.iloc[1:3,:]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:44:25.354652Z", "start_time": "2018-10-23T19:44:25.345998Z" } }, "outputs": [], "source": [ "df.iloc[:,1:3]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Para acceder a un valor explícito" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:44:26.829103Z", "start_time": "2018-10-23T19:44:26.821835Z" } }, "outputs": [], "source": [ "df.iloc[1,1]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Para obtener rápido acceso a un escalar:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:44:31.303757Z", "start_time": "2018-10-23T19:44:31.295198Z" } }, "outputs": [], "source": [ "df.iat[1,1]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Indices booleanos\n", "Usando los valores de una columna para extraer datos, cuando una condición se satisface:\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:46:37.511554Z", "start_time": "2018-10-23T19:46:37.495666Z" } }, "outputs": [], "source": [ "df" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:46:50.187028Z", "start_time": "2018-10-23T19:46:50.177323Z" } }, "outputs": [], "source": [ "df.A>0" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:47:11.450514Z", "start_time": "2018-10-23T19:47:11.434932Z" } }, "outputs": [], "source": [ "df[df.A>0]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Se usa la función 'isin()' para filtrar:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:47:58.039461Z", "start_time": "2018-10-23T19:47:58.019639Z" } }, "outputs": [], "source": [ "df2 = df.copy()\n", "df2['E']=['one','one','two','three','four','three']\n", "df2" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:49:03.014211Z", "start_time": "2018-10-23T19:49:03.003675Z" } }, "outputs": [], "source": [ "df2.E.isin(['two','four'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:49:22.631701Z", "start_time": "2018-10-23T19:49:22.612097Z" } }, "outputs": [], "source": [ "df2[df2.E.isin(['two','four'])]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Advanced Indexing" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:49:50.826104Z", "start_time": "2018-10-23T19:49:50.808098Z" } }, "outputs": [], "source": [ "df2" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:50:05.673735Z", "start_time": "2018-10-23T19:50:05.657880Z" } }, "outputs": [], "source": [ "df2.filter(items = ['A','D'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:51:35.713118Z", "start_time": "2018-10-23T19:51:35.696233Z" } }, "outputs": [], "source": [ "df2.filter(like = '-01', axis=0)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:52:00.923540Z", "start_time": "2018-10-23T19:52:00.902050Z" } }, "outputs": [], "source": [ "df2.filter(regex = '2017-01-0[29]', axis=0)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:53:20.167814Z", "start_time": "2018-10-23T19:53:20.151178Z" } }, "outputs": [], "source": [ "df2.select(lambda x: x > pd.Timestamp('2017-01-23'))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:54:15.348707Z", "start_time": "2018-10-23T19:54:15.330583Z" } }, "outputs": [], "source": [ "df2.select(lambda x: x > 'B', axis=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:54:23.621513Z", "start_time": "2018-10-23T19:54:23.598278Z" } }, "outputs": [], "source": [ "df2.where(df2 > 0)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:57:30.703064Z", "start_time": "2018-10-23T19:57:30.683930Z" } }, "outputs": [], "source": [ "df2.where(df2 < 0)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:57:53.075005Z", "start_time": "2018-10-23T19:57:53.055020Z" } }, "outputs": [], "source": [ "df2.query('A > B')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:58:06.982971Z", "start_time": "2018-10-23T19:58:06.958985Z" } }, "outputs": [], "source": [ "df2.query('C**2 > A**2 + B**2 ')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T19:58:13.820335Z", "start_time": "2018-10-23T19:58:13.802639Z" } }, "outputs": [], "source": [ "df2.query('A > B and B > C')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Configurando\n", "\n", "Setear una nueva columna alinea automáticamente los datos por sus índices:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:00:47.612346Z", "start_time": "2018-10-23T20:00:47.600295Z" }, "scrolled": false }, "outputs": [], "source": [ "s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20170101', periods = 6, freq='W-MON'))\n", "display(s1)\n", "df['F'] = s1" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:00:48.924971Z", "start_time": "2018-10-23T20:00:48.904108Z" } }, "outputs": [], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Seteando valores por etiqueta:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:01:07.130698Z", "start_time": "2018-10-23T20:01:07.125072Z" } }, "outputs": [], "source": [ "df.at[dates[0],'A']=0" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Seteando valores por posición:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:01:19.612752Z", "start_time": "2018-10-23T20:01:19.607401Z" } }, "outputs": [], "source": [ "df.iat[0,1]=0" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:01:20.066154Z", "start_time": "2018-10-23T20:01:20.057477Z" } }, "outputs": [], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Seteando valores usando un 'numpy array' y mostramos el resultado" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:01:56.866953Z", "start_time": "2018-10-23T20:01:56.852927Z" } }, "outputs": [], "source": [ "df.loc[:,'D'] = np.array([5] * len(df))\n", "\n", "df" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:03:28.745255Z", "start_time": "2018-10-23T20:03:28.740725Z" } }, "outputs": [], "source": [ "df['D'] = np.array([6] * len(df))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:03:29.063813Z", "start_time": "2018-10-23T20:03:29.055008Z" } }, "outputs": [], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Una operación 'where' para setear:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:03:45.028682Z", "start_time": "2018-10-23T20:03:45.006526Z" } }, "outputs": [], "source": [ "df2 = df.copy()\n", "df2[df2>0] = -df2\n", "df2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## NaN\n", "\n", "Pandas usa primariamente el valor 'np.nan' para representar datos faltantes. \n", "\n", "Reindexar permite cambiar/añadir/borrar el índice de un eje expecífico. Esto entrega una copia de los datos" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:05:12.588900Z", "start_time": "2018-10-23T20:05:12.568645Z" }, "scrolled": true }, "outputs": [], "source": [ "df1 = df.reindex(index=dates[0:4], columns=list(df.columns)+['E'])\n", "df1.loc[dates[0]:dates[1],'E']=1\n", "df1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Eliminando las filas que tienen datos faltantes:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:05:54.865644Z", "start_time": "2018-10-23T20:05:54.847591Z" } }, "outputs": [], "source": [ "df1.dropna(how='any')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Rellenando los datos faltantes" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:06:52.553736Z", "start_time": "2018-10-23T20:06:52.534336Z" } }, "outputs": [], "source": [ "df1.fillna(value=-100)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Para obtener los valores booleanos cuando hay NaN" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:07:03.535331Z", "start_time": "2018-10-23T20:07:03.519342Z" } }, "outputs": [], "source": [ "pd.isnull(df1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:08:36.061791Z", "start_time": "2018-10-23T20:08:36.045279Z" } }, "outputs": [], "source": [ "df1.replace(np.nan, 'Null')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:09:34.165397Z", "start_time": "2018-10-23T20:09:34.155661Z" } }, "outputs": [], "source": [ "df1.replace(6, 5.999999)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:09:40.682682Z", "start_time": "2018-10-23T20:09:40.662736Z" } }, "outputs": [], "source": [ "df1" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:10:03.635989Z", "start_time": "2018-10-23T20:10:03.630077Z" } }, "outputs": [], "source": [ "df1.replace(6, 5.999999, inplace=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:10:16.357835Z", "start_time": "2018-10-23T20:10:16.337469Z" } }, "outputs": [], "source": [ "df1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Stats\n", "\n", "efectuando estadísticas descriptivas" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:10:43.276043Z", "start_time": "2018-10-23T20:10:43.264357Z" } }, "outputs": [], "source": [ "df.mean()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Misma operación para el otro eje:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:10:58.424336Z", "start_time": "2018-10-23T20:10:58.412671Z" } }, "outputs": [], "source": [ "df.mean(axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Operando con objetos que tienen diferente dimensionalidad necesita un alineamiento. En adición, pandas automaticamente difunde (broadcasts) en las dimensiones especificadas" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:13:21.009004Z", "start_time": "2018-10-23T20:13:20.999799Z" }, "scrolled": false }, "outputs": [], "source": [ "s = pd.Series([1,3,5,np.nan, 6 ,8], index=dates).shift(2)\n", "display(s)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:13:53.100884Z", "start_time": "2018-10-23T20:13:53.080304Z" } }, "outputs": [], "source": [ "df.sub(s, axis='index')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Iteration" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:15:33.859941Z", "start_time": "2018-10-23T20:15:33.852019Z" }, "scrolled": true }, "outputs": [], "source": [ "df" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:15:59.144949Z", "start_time": "2018-10-23T20:15:59.119609Z" } }, "outputs": [], "source": [ "for column in df.iteritems():\n", " print(type(column), len(column), column[0], type(column[1]))\n", " display(column[1])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:17:46.956950Z", "start_time": "2018-10-23T20:17:46.937595Z" } }, "outputs": [], "source": [ "for name,serie in df.iteritems():\n", " display(serie)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:17:55.842098Z", "start_time": "2018-10-23T20:17:55.806636Z" } }, "outputs": [], "source": [ "for row in df.iterrows():\n", " print(type(row), len(row), row[0], type(row[1]))\n", " display(row[1])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:19:27.825613Z", "start_time": "2018-10-23T20:19:27.808532Z" } }, "outputs": [], "source": [ "for frame in df.itertuples():\n", " print(type(frame), len(frame))\n", " display(frame)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Apply\n", "\n", "Aplicando funciones a los datos:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:19:37.341311Z", "start_time": "2018-10-23T20:19:37.323545Z" }, "scrolled": true }, "outputs": [], "source": [ "df" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:19:58.658612Z", "start_time": "2018-10-23T20:19:58.636089Z" }, "scrolled": true }, "outputs": [], "source": [ "df.apply(np.cumsum)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:20:21.105512Z", "start_time": "2018-10-23T20:20:21.083994Z" } }, "outputs": [], "source": [ "df.apply(np.cumsum, axis=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:20:44.418646Z", "start_time": "2018-10-23T20:20:44.406359Z" } }, "outputs": [], "source": [ "df.apply(lambda x: x.max()-x.min())" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:20:50.768191Z", "start_time": "2018-10-23T20:20:50.752677Z" } }, "outputs": [], "source": [ "df.apply(lambda x: x.max()-x.min(), axis=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:21:48.389549Z", "start_time": "2018-10-23T20:21:48.367749Z" }, "scrolled": true }, "outputs": [], "source": [ "df.applymap(lambda x: '%.2f' % x)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:22:14.494758Z", "start_time": "2018-10-23T20:22:14.483435Z" } }, "outputs": [], "source": [ "df['A'].map(lambda x: x**2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Histogramas" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:24:22.159518Z", "start_time": "2018-10-23T20:24:22.144119Z" }, "scrolled": true }, "outputs": [], "source": [ "s = pd.Series(np.random.randint(0,7,size=100))\n", "s" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:24:22.949473Z", "start_time": "2018-10-23T20:24:22.938232Z" } }, "outputs": [], "source": [ "s.value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Métodos strings\n", "\n", "Las series están equipadas con un conjunto de métodos de procesamiento de strings, en el atributo 'str' que hace fácil de operar en cada elemento del arreglo, como en el código que sigue.\n", "\n", "Notemos que el 'pattern-matching' en 'str' usa 'expresiones regulares' por defecto" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:25:01.800803Z", "start_time": "2018-10-23T20:25:01.790985Z" } }, "outputs": [], "source": [ "s = pd.Series(['A','B','C','Aaba','Baca', np.nan, 'CABA', 'dog','cat'])\n", "s" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:25:46.746610Z", "start_time": "2018-10-23T20:25:46.735218Z" } }, "outputs": [], "source": [ "s.str.upper()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Merge\n", "\n", "### Concatenar\n", "\n", "pandas provee varias facilidades para combinar juntos objetos Series, DataFrames y Panel con varios objetos de conjuntos lógicos para indexar y relacionar en el caso de operaciones tipo 'join/merge'\n", "\n", "Concatenando objetos juntos con 'concat()'" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:26:08.392288Z", "start_time": "2018-10-23T20:26:08.374251Z" } }, "outputs": [], "source": [ "df = pd.DataFrame(np.random.randn(10, 4))\n", "display(df)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:26:29.215644Z", "start_time": "2018-10-23T20:26:29.211580Z" } }, "outputs": [], "source": [ "pieces = [df[7:], df[3:7], df[:3]]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:26:30.201097Z", "start_time": "2018-10-23T20:26:30.184782Z" } }, "outputs": [], "source": [ "pieces[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:26:33.503061Z", "start_time": "2018-10-23T20:26:33.485433Z" } }, "outputs": [], "source": [ "pieces[1]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:26:36.943107Z", "start_time": "2018-10-23T20:26:36.926705Z" } }, "outputs": [], "source": [ "pieces[2]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:28:10.995443Z", "start_time": "2018-10-23T20:28:10.982723Z" } }, "outputs": [], "source": [ "pd.concat?" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:27:12.587139Z", "start_time": "2018-10-23T20:27:12.564596Z" } }, "outputs": [], "source": [ "pd.concat(pieces).sort_index()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:29:16.099493Z", "start_time": "2018-10-23T20:29:16.089191Z" } }, "outputs": [], "source": [ "s1 = pd.Series(['a', 'b'])\n", "s2 = pd.Series(['c', 'd'])\n", "dff = pd.concat([s1, s2])\n", "dff" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:29:45.904429Z", "start_time": "2018-10-23T20:29:45.893669Z" } }, "outputs": [], "source": [ "pd.concat([s1, s2], ignore_index=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:29:55.770400Z", "start_time": "2018-10-23T20:29:55.756851Z" } }, "outputs": [], "source": [ "pd.concat([s1, s2], keys=['s1', 's2',])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:30:39.762746Z", "start_time": "2018-10-23T20:30:39.749101Z" } }, "outputs": [], "source": [ "pd.concat([s1, s2], keys=['s1', 's2'], names=['Series name', 'Row ID'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:31:08.624263Z", "start_time": "2018-10-23T20:31:08.594653Z" } }, "outputs": [], "source": [ "df1 = pd.DataFrame([['a', 1], ['b', 2]], columns=['letter', 'number'])\n", "df2 = pd.DataFrame([['c', 3], ['d', 4]], columns=['letter', 'number'])\n", "df3 = pd.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']], columns=['letter', 'number', 'animal'])\n", "df4 = pd.DataFrame([['bird', 'polly'], ['monkey', 'george']], columns=['animal', 'name'])\n", "display(df1, df2, df3, df4)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:32:09.057007Z", "start_time": "2018-10-23T20:32:09.044107Z" } }, "outputs": [], "source": [ "pd.concat([df1, df2])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:32:23.192393Z", "start_time": "2018-10-23T20:32:23.175866Z" } }, "outputs": [], "source": [ "pd.concat([df1, df3])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:32:52.741834Z", "start_time": "2018-10-23T20:32:52.728462Z" } }, "outputs": [], "source": [ "pd.concat([df1, df3], join=\"inner\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:34:00.323691Z", "start_time": "2018-10-23T20:34:00.314052Z" } }, "outputs": [], "source": [ "pd.concat([df1, df4], axis=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:36:12.750615Z", "start_time": "2018-10-23T20:36:12.730424Z" } }, "outputs": [], "source": [ "df5 = pd.DataFrame([1], index=['a'])\n", "df6 = pd.DataFrame([2], index=['a'])\n", "display(df5, df6)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:36:13.060144Z", "start_time": "2018-10-23T20:36:13.054572Z" } }, "outputs": [], "source": [ "pd.concat([df5, df6])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:36:13.523827Z", "start_time": "2018-10-23T20:36:13.510047Z" }, "scrolled": true }, "outputs": [], "source": [ "pd.concat([df5, df6], verify_integrity=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Juntando\n", "\n", "Merge estilo SQL:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:36:26.574524Z", "start_time": "2018-10-23T20:36:26.554080Z" } }, "outputs": [], "source": [ "left = pd.DataFrame({'key': ['foo','foo'], 'lval': [1,2]})\n", "right = pd.DataFrame({'key': ['foo','foo'], 'rval': [4,5]})\n", "display(left)\n", "display(right)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:37:04.313956Z", "start_time": "2018-10-23T20:37:04.297096Z" } }, "outputs": [], "source": [ "pd.merge(left, right, on='key')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:37:43.804164Z", "start_time": "2018-10-23T20:37:43.790201Z" } }, "outputs": [], "source": [ "pd.merge(left, right, on='key', how='right')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:38:24.746745Z", "start_time": "2018-10-23T20:38:24.731049Z" } }, "outputs": [], "source": [ "pd.merge(right, left, on='key')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Otro ejemplo que puede ser dado es:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:39:34.359164Z", "start_time": "2018-10-23T20:39:34.336600Z" } }, "outputs": [], "source": [ "left = pd.DataFrame({'key': ['foo', 'bar', 'foo', 'bar'], 'lval': [0, 1, 2, 3]})\n", "right = pd.DataFrame({'key': ['foo', 'bar', 'bar'], 'rval': [4, 5, 6]})\n", "display(left)\n", "display(right)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:39:53.676722Z", "start_time": "2018-10-23T20:39:53.659895Z" } }, "outputs": [], "source": [ "pd.merge(left, right, on='key')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:40:33.682692Z", "start_time": "2018-10-23T20:40:33.666261Z" } }, "outputs": [], "source": [ "pd.merge(left, right, on='key', how='left')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Join estilo SQL" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:41:05.387323Z", "start_time": "2018-10-23T20:41:05.371935Z" } }, "outputs": [], "source": [ "caller = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'], \n", " 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})\n", "caller" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:41:06.650800Z", "start_time": "2018-10-23T20:41:06.638862Z" } }, "outputs": [], "source": [ "other = pd.DataFrame({'key': ['K0', 'K1', 'K2'], \n", " 'B': ['B0', 'B1', 'B2']})\n", "other" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:41:44.028425Z", "start_time": "2018-10-23T20:41:44.012197Z" } }, "outputs": [], "source": [ "caller.join(other, lsuffix='_caller', rsuffix='_other')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:42:45.640217Z", "start_time": "2018-10-23T20:42:45.626781Z" } }, "outputs": [], "source": [ "caller" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:42:56.077159Z", "start_time": "2018-10-23T20:42:56.060742Z" } }, "outputs": [], "source": [ "caller.set_index('key')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:43:13.410265Z", "start_time": "2018-10-23T20:43:13.396771Z" } }, "outputs": [], "source": [ "caller.set_index('key').join(other.set_index('key'))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-23T20:43:34.275504Z", "start_time": "2018-10-23T20:43:34.259341Z" } }, "outputs": [], "source": [ "caller.join(other.set_index('key'), on='key')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Adjuntando\n", "\n", "Adjuntando filas a un dataframe" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:23:45.269402Z", "start_time": "2018-10-25T19:23:45.232041Z" } }, "outputs": [], "source": [ "df = pd.DataFrame(np.random.randn(8,4), columns=['A','B','C','D'])\n", "df" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:23:53.919742Z", "start_time": "2018-10-25T19:23:53.913327Z" } }, "outputs": [], "source": [ "s = df.iloc[3]\n", "s" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:24:00.050169Z", "start_time": "2018-10-25T19:24:00.038737Z" } }, "outputs": [], "source": [ "df.append(s, ignore_index=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:24:20.019890Z", "start_time": "2018-10-25T19:24:20.007930Z" } }, "outputs": [], "source": [ "new = {'A': 0, 'B': 1, 'C': 2, 'D': 3}\n", "df.append(new, ignore_index=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Agrupando\n", "\n", "Por 'group by' nos referimos a un proceso que envuelve uno o más pasos.\n", "- Dividiendo ('Splitting') los datos en grupos dado cierto criterio\n", "- Aplicando ('Applying') una función a cada grupo independientemente.\n", "- Combinando ('Combining') el resultado dentro de una estructura de datos" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:26:49.775389Z", "start_time": "2018-10-25T19:26:49.765425Z" }, "scrolled": false }, "outputs": [], "source": [ "df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar','foo', 'bar', 'foo', 'foo'],\n", " 'B' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],\n", " 'C' : np.random.randn(8),\n", " 'D' : np.random.randn(8)})\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Agrupando y después aplicando la función 'sum' a los grupos resultantes" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:27:35.936723Z", "start_time": "2018-10-25T19:27:35.923861Z" } }, "outputs": [], "source": [ "df.groupby('A').sum()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Agrupando por multiples columnas forma un índice jerárquico, al cual se le aplica la función" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:28:37.053400Z", "start_time": "2018-10-25T19:28:37.040639Z" } }, "outputs": [], "source": [ "df.groupby(['A','B']).sum()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Reshaping" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:31:21.927649Z", "start_time": "2018-10-25T19:31:21.922255Z" } }, "outputs": [], "source": [ "tuples = list(zip(*[['bar', 'bar', 'baz', 'baz',\n", " 'foo', 'foo', 'qux', 'qux'],\n", " ['one', 'two', 'one', 'two',\n", " 'one', 'two', 'one', 'two']]))\n", "tuples" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:31:34.695439Z", "start_time": "2018-10-25T19:31:34.689329Z" } }, "outputs": [], "source": [ "index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])\n", "index" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:32:20.809447Z", "start_time": "2018-10-25T19:32:20.798498Z" } }, "outputs": [], "source": [ "df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])\n", "df" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:32:49.839784Z", "start_time": "2018-10-25T19:32:49.827865Z" } }, "outputs": [], "source": [ "df.T" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:32:56.456275Z", "start_time": "2018-10-25T19:32:56.448751Z" } }, "outputs": [], "source": [ "df2 = df[:4]\n", "df2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "El método 'stack()' comprime un nivel en las columnas del dataframe" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:33:49.191894Z", "start_time": "2018-10-25T19:33:49.186008Z" } }, "outputs": [], "source": [ "stacked = df2.stack()\n", "stacked" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Con un dataframe 'stacked', la operación inversa de 'stack' es 'unstack', el cual por default desapila el último nivel:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:34:23.069749Z", "start_time": "2018-10-25T19:34:23.045850Z" } }, "outputs": [], "source": [ "stacked.unstack()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:34:38.789067Z", "start_time": "2018-10-25T19:34:38.777751Z" } }, "outputs": [], "source": [ "stacked.unstack(1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:34:56.998586Z", "start_time": "2018-10-25T19:34:56.989563Z" } }, "outputs": [], "source": [ "stacked.unstack(0)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:35:23.220853Z", "start_time": "2018-10-25T19:35:23.208198Z" } }, "outputs": [], "source": [ "stacked.unstack([0,1])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Pivoteando tablas" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:37:40.200816Z", "start_time": "2018-10-25T19:37:40.187420Z" } }, "outputs": [], "source": [ "df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3,\n", " 'B' : ['A', 'B', 'C'] * 4,\n", " 'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,\n", " 'D' : np.random.randn(12),\n", " 'E' : np.random.randn(12)})\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Se puede pivotear tables desde estos datos fácilmente:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:38:43.749382Z", "start_time": "2018-10-25T19:38:43.730313Z" } }, "outputs": [], "source": [ "pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "melt corresponde a la operación inversa " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:39:59.918996Z", "start_time": "2018-10-25T19:39:59.901641Z" } }, "outputs": [], "source": [ "df" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:40:46.156461Z", "start_time": "2018-10-25T19:40:46.132625Z" }, "scrolled": true }, "outputs": [], "source": [ "pd.melt(df)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:41:50.491303Z", "start_time": "2018-10-25T19:41:50.475506Z" }, "scrolled": true }, "outputs": [], "source": [ "pd.melt(df, id_vars=['A', 'B', 'C', 'D'], value_vars=['E'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:42:16.693550Z", "start_time": "2018-10-25T19:42:16.677305Z" }, "scrolled": true }, "outputs": [], "source": [ "pd.melt(df, id_vars=['A', 'B', 'C'], value_vars=['D', 'E'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Series de tiempo\n", "\n", "panda tiene un simple, poderoso y eficiente funcionalidad y desempeño al realizar operaciones de re-sampleo durante conversión de frecuencia (ejemplo, convirtiendo datos en segundos a datos cada 5 minutos). " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:44:16.231673Z", "start_time": "2018-10-25T19:44:16.221004Z" }, "scrolled": true }, "outputs": [], "source": [ "rng = pd.date_range('1/1/2012', periods=100, freq='S')\n", "ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)\n", "ts" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:44:44.560154Z", "start_time": "2018-10-25T19:44:44.553199Z" } }, "outputs": [], "source": [ "ts.index[1:] - ts.index[:-1]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:45:02.142871Z", "start_time": "2018-10-25T19:45:01.981824Z" }, "scrolled": true }, "outputs": [], "source": [ "ts.plot()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:46:59.514984Z", "start_time": "2018-10-25T19:46:59.509043Z" } }, "outputs": [], "source": [ "aux= ts.resample('10s')\n", "aux.sum()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Representación en zona de tiempo (Time zone)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:48:30.703061Z", "start_time": "2018-10-25T19:48:30.691002Z" } }, "outputs": [], "source": [ "rng = pd.date_range('3/6/2012 00:00', periods=5, freq='D')\n", "ts = pd.Series(np.random.randn(len(rng)), rng)\n", "display(ts)\n", "ts_utc = ts.tz_localize('UTC')\n", "ts_utc" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Convirtiendo a otra zona horaria:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:48:37.083601Z", "start_time": "2018-10-25T19:48:37.054944Z" } }, "outputs": [], "source": [ "ts_utc.tz_convert('US/Eastern')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Convirtiendo entre representaciones de lapso de tiempo " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:49:09.601777Z", "start_time": "2018-10-25T19:49:09.574079Z" } }, "outputs": [], "source": [ "rng = pd.date_range('1/1/2012', periods=5, freq='M')\n", "ts = pd.Series(np.random.randn(len(rng)), index=rng)\n", "display(ts)\n", "ps = ts.to_period()\n", "display(ps)\n", "ps.to_timestamp()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Convirtiendo entre periodos y lapsos de tiempo permite usar ciertas funciones aritméticas convenientes. EN el siguiente ejemplo, se convierte una frecuencia trimestral con fin de año en noviembre a las 9am de el fin del mes siguiendo el fin del trimestre." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:49:54.601402Z", "start_time": "2018-10-25T19:49:54.583759Z" } }, "outputs": [], "source": [ "prng = pd.period_range('1990Q1', '2000Q4', freq='Q-NOV')\n", "ts = pd.Series(np.random.randn(len(prng)), prng)\n", "ts" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:50:06.333433Z", "start_time": "2018-10-25T19:50:06.328506Z" } }, "outputs": [], "source": [ "ts.index = (prng.asfreq('M', 'e') + 1).asfreq('H', 's') + 9\n", "ts.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:51:57.973527Z", "start_time": "2018-10-25T19:51:57.967637Z" }, "scrolled": true }, "outputs": [], "source": [ "rng1 = pd.date_range('1/1/2012', periods=5, freq='S')\n", "rng2 =pd.date_range('1/1/2012', periods=5, freq='M')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:52:02.445356Z", "start_time": "2018-10-25T19:52:02.439568Z" } }, "outputs": [], "source": [ "rng1" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:52:05.155452Z", "start_time": "2018-10-25T19:52:05.149891Z" } }, "outputs": [], "source": [ "rng2" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:52:13.075039Z", "start_time": "2018-10-25T19:52:13.068779Z" } }, "outputs": [], "source": [ "rng1 - rng2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Categórico (Categorical)\n", "Desde la versión 0.15 pandas incluye datos categoricos en DataFrame." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:53:56.594853Z", "start_time": "2018-10-25T19:53:56.589323Z" } }, "outputs": [], "source": [ "df = pd.DataFrame({\"id\":[1,2,3,4,5,6], \"raw_grade\":['a', 'b', 'b', 'a', 'a', 'e']})\n", "df[\"raw_grade\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Convirtiendo los grados en crudo a datos de tipo categórico" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:54:24.103037Z", "start_time": "2018-10-25T19:54:24.090407Z" } }, "outputs": [], "source": [ "df[\"grade\"] = df[\"raw_grade\"].astype(\"category\")\n", "df[\"grade\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Cambiar el nombre de las categorías a nombres con más sentido (asignar a 'series.cat.categories' está en su lugar!)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:54:34.375133Z", "start_time": "2018-10-25T19:54:34.370590Z" } }, "outputs": [], "source": [ "df[\"grade\"].cat.categories = [\"very good\", \"good\", \"very bad\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:54:34.948357Z", "start_time": "2018-10-25T19:54:34.939827Z" } }, "outputs": [], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Reordenar las categorias y simultáneamente añadir las categorías faltantes (métodos bajo 'Series.cat' devuelven una nueva 'serie' por default)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:54:48.555983Z", "start_time": "2018-10-25T19:54:48.548337Z" } }, "outputs": [], "source": [ "df[\"grade\"] = df[\"grade\"].cat.set_categories([\"very bad\", \"bad\", \"medium\", \"good\", \"very good\"])\n", "df[\"grade\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Al ordenar se hace por orden en las categorías, no por orden léxico:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:55:00.757154Z", "start_time": "2018-10-25T19:55:00.749099Z" } }, "outputs": [], "source": [ "df.sort_values(by=\"grade\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Agrupando por una columna categórica muestra también categorías vacías:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:55:04.693985Z", "start_time": "2018-10-25T19:55:04.685272Z" } }, "outputs": [], "source": [ "df.groupby(\"grade\").size()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Graficando" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:55:51.775678Z", "start_time": "2018-10-25T19:55:51.619139Z" } }, "outputs": [], "source": [ "ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))\n", "\n", "ts = ts.cumsum()\n", "ts.plot()\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "En un 'DataFrame', 'plot()' es una conveniencia para graficar todas las columnas con etiquetas: " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:56:09.663160Z", "start_time": "2018-10-25T19:56:09.432816Z" } }, "outputs": [], "source": [ "df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index,\n", " columns=['A', 'B', 'C', 'D'])\n", "df = df.cumsum()\n", "\n", "df.plot()\n", "plt.legend(loc='best')\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:56:25.538044Z", "start_time": "2018-10-25T19:56:25.334316Z" } }, "outputs": [], "source": [ "df[['A','B']].plot()\n", "plt.legend()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:57:51.751012Z", "start_time": "2018-10-25T19:57:51.521361Z" } }, "outputs": [], "source": [ "df.plot()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:57:47.175285Z", "start_time": "2018-10-25T19:57:47.037567Z" } }, "outputs": [], "source": [ "plt.plot(df)\n", "plt.tight_layout()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Obteniendo datos - Exportando datos" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### CSV\n", "Escribiendo un archivo CSV:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:59:04.835589Z", "start_time": "2018-10-25T19:59:04.819868Z" } }, "outputs": [], "source": [ "df.to_csv('nombrearchivo.csv')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Leyendo desde un archivo CSV:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:59:13.868229Z", "start_time": "2018-10-25T19:59:13.847729Z" }, "scrolled": true }, "outputs": [], "source": [ "pd.read_csv('nombrearchivo.csv', index_col=0)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### HDF5\n", "Leyendo y escribiendo HDFStores:\n", "\n", "Escribiendo en un HDF5:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:59:36.724346Z", "start_time": "2018-10-25T19:59:36.312154Z" } }, "outputs": [], "source": [ "df.to_hdf('nombrearchivo.h5','df')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Leyendo desde un HDF5:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T19:59:38.067639Z", "start_time": "2018-10-25T19:59:38.034147Z" }, "scrolled": true }, "outputs": [], "source": [ "pd.read_hdf('nombrearchivo.h5','df')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Excel\n", "\n", "Escribiendo un archivo Excel:\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:00:28.779400Z", "start_time": "2018-10-25T20:00:28.637529Z" }, "scrolled": true }, "outputs": [], "source": [ "df.to_excel('foo.xlsx', sheet_name='Sheet1')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Leyendo un archivo Excel:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:00:32.428322Z", "start_time": "2018-10-25T20:00:32.352081Z" }, "scrolled": true }, "outputs": [], "source": [ "pd.read_excel('foo.xlsx', 'Sheet1', index_col=None, na_values=['NA'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Pandas Chapters\n", "\n", "## Capítulo 1: leyendo datos desde un CSV\n", "\n", "En este capítulo se leerán datos de un archivo CSV usando la función 'read_csv' de pandas. Por defecto, la función asume que los datos vienen separados por comas.\n", "\n", "Vamos a mirar datos ciclistas de Montreal, y se pueden ver los datos usados en la página http://donnees.ville.montreal.qc.ca/dataset/velos-comptage\n", "\n", "En este dataset se muestra cuánta gente estuvo en 7 diferentes ciclovias en Montreal, cada día." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:02:01.898517Z", "start_time": "2018-10-25T20:02:01.893069Z" } }, "outputs": [], "source": [ "## CHAPTER 1\n", "%matplotlib inline\n", "\n", "import pandas as pd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Intentamos leer la tabla:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:02:05.446685Z", "start_time": "2018-10-25T20:02:05.412832Z" } }, "outputs": [], "source": [ "broken_df = pd.read_csv('bikes.csv')\n", "broken_df[:3]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Está roto el archivo, así que para arreglar el problema, se hará:\n", " - Cambiar el separador de columnas a ';'\n", " - Cambiar el encoding a 'latin1' (por defecto es 'utf8')\n", " - Asignar las fechas a la columna 'Dates'\n", " - Especificar que nuestras fechas tienen el día primero en vez del mes\n", " - Dejar el índice en las columnas 'Dates'" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:03:13.804386Z", "start_time": "2018-10-25T20:03:13.746619Z" } }, "outputs": [], "source": [ "fixed_df = pd.read_csv('bikes.csv', sep=';', encoding='latin1', parse_dates=['Date'], dayfirst=True, index_col='Date')\n", "fixed_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Seleccionando una columna\n", "\n", "Cuando se lee un CSV se tiene un objeto que se denomina 'DataFrame', el cual está hecho de filas y columnas. Puedes obtener las columnas a partir del 'DataFrame' de la misma forma en la que se obtienen los elementos de un diccionario:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:03:57.990417Z", "start_time": "2018-10-25T20:03:57.978145Z" }, "scrolled": true }, "outputs": [], "source": [ "fixed_df['Berri 1']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Graficando una columa\n", "\n", "Sólo se debe añadir la función '.plot()' al final!\n", "\n", "Así se ve, de una forma no sorprendente, que la gente no usa mucho la bicicleta durante Enero, Febrero y Marzo." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:04:05.020570Z", "start_time": "2018-10-25T20:04:04.769936Z" } }, "outputs": [], "source": [ "fixed_df['Berri 1'].plot()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "También se pueden plotear todas las columnas igual de fácil, y se puede configurar para que la figura sea más grande. Al estar juntos se ven más apretados, pero todas las ciclovias tienen la misma estructura. Si es un mal día para un ciclista, es un mal día en todas partes." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:04:24.960771Z", "start_time": "2018-10-25T20:04:24.523617Z" } }, "outputs": [], "source": [ "fixed_df.plot(figsize=(15,10))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Capítulo 2: \n", "\n", "Usaremos un nuevo conjunto de datos para demostrar cómo trabajar con grandes dataset. Este corresponde a un subconjunto de las peticiones de servicio del 311, obtenidos desde NYC Open Data (https://nycopendata.socrata.com/Social-Services/311-Service-Requests-from-2010-to-Present/erm2-nwe9)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:05:03.009803Z", "start_time": "2018-10-25T20:05:03.002337Z" } }, "outputs": [], "source": [ "import pandas as pd\n", "from IPython.display import display\n", "\n", "# Make the graphs a bit prettier, and bigger\n", "pd.set_option('display.width', 5000) \n", "pd.set_option('display.max_columns', 60) " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:05:09.151301Z", "start_time": "2018-10-25T20:05:07.967459Z" }, "scrolled": true }, "outputs": [], "source": [ "complaints = pd.read_csv('311-service-requests.csv')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Qué hay en ella? (el resumen)\n", "\n", "Cuando se mira un gran dataframe, en vez de mostrar el contenido del dataframe, se mostrará un resumen (summary). Este incluye todas las columas y cuántos valores no nulos están en cada columna\n", "\n", "¿SI?" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:05:21.919521Z", "start_time": "2018-10-25T20:05:21.797781Z" } }, "outputs": [], "source": [ "complaints" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Seleccionando filas y columnas\n", "\n", "Para seleccionar una columna, se indexa con el nombre de la columna, como sigue:\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:05:44.386147Z", "start_time": "2018-10-25T20:05:44.377577Z" } }, "outputs": [], "source": [ "complaints['Complaint Type']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Para obtener las 5 primeras columas, podemos usar una tajada como 'df[:5].\n", "Esta es una gran manera para tener un sentido de qué tipo de información está en el 'dataframe'. " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:06:09.022336Z", "start_time": "2018-10-25T20:06:08.980389Z" } }, "outputs": [], "source": [ "complaints[:5]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Y se puede combinar para obtener las cinco primeras filas de una columna" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:06:11.723991Z", "start_time": "2018-10-25T20:06:11.717426Z" } }, "outputs": [], "source": [ "complaints['Complaint Type'][:5]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Y no importa en qué orden se realiza:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:06:15.543566Z", "start_time": "2018-10-25T20:06:15.538373Z" } }, "outputs": [], "source": [ "complaints[:5]['Complaint Type']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Seleccionando multiples columnas\n", "\n", "Qué si queremos saber el tipo de queja (complaint) y la ciudad (borough), pero no el resto de la información?\n", "Pandas hace esto realmente fácil al seleccionar un subconjunto de columnas, sólo se indexa con una lista de las columnas que se quieren:\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:06:27.355569Z", "start_time": "2018-10-25T20:06:27.338919Z" } }, "outputs": [], "source": [ "complaints[['Complaint Type', 'Borough']]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Cuál es la queja más común?\n", "\n", "Esta es una pregunta realmente fácil de resolver. Hay una método '.value_counts()' que podemos usar:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:06:47.707287Z", "start_time": "2018-10-25T20:06:47.685796Z" }, "scrolled": true }, "outputs": [], "source": [ "complaints['Complaint Type'].value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Y si solo queremos el top 10 de las quejas más comunes:\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:07:19.212151Z", "start_time": "2018-10-25T20:07:19.192813Z" } }, "outputs": [], "source": [ "complaint_counts = complaints['Complaint Type'].value_counts()\n", "complaint_counts[:10]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Y podemos graficarlos!!" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:07:27.130346Z", "start_time": "2018-10-25T20:07:26.852905Z" } }, "outputs": [], "source": [ "complaint_counts[:10].plot(kind='bar', figsize=(15,10))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Capitulo 3\n", "\n", "Continuaremos con el dataframe usado en el capítulo anterior\n", "\n", "### Seleccionando sólo las quejas por ruido (noise complaints)\n", "\n", "Se quiere saber qué ciudad (borough) tiene más quejas por ruido. En primer lugar, vamos a tomar una mirada de la data para ver cómo se ve" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:07:47.776372Z", "start_time": "2018-10-25T20:07:47.734073Z" } }, "outputs": [], "source": [ "complaints[:5]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Para obtener las quejas por ruido, necesitamos encontrar las filas donde la columna 'Complaint type' es 'Noise - Street/Sidewalk'. Vamos a ver cómo hacer eso en el ejemplo que sigue:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:07:56.147795Z", "start_time": "2018-10-25T20:07:56.095533Z" } }, "outputs": [], "source": [ "noise_complaints = complaints[complaints['Complaint Type'] == 'Noise - Street/Sidewalk']\n", "noise_complaints[:5]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Si se mira a 'noise_complaints' se ve que funcionó y que sólo contiene quejas de ese tipo. Pero cómo funciona?\n", "\n", "Vamos a deconstuirlo en dos partes:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:08:06.897121Z", "start_time": "2018-10-25T20:08:06.882998Z" } }, "outputs": [], "source": [ "complaints['Complaint Type'] == \"Noise - Street/Sidewalk\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Este es un gran arreglo de Trues y Falses, uno por cada fila de nuestro dataframe. Cuando nosotros indexamos este dataframe con este arreglo, se obtienen solo las columnas donde hay Trues.\n", "\n", "Se puede también combinar una o más condiciones usando el operador '&' como sigue:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:08:52.222469Z", "start_time": "2018-10-25T20:08:52.168784Z" } }, "outputs": [], "source": [ "# Combinando más de una condición\n", "is_noise = complaints['Complaint Type'] == \"Noise - Street/Sidewalk\"\n", "in_brooklyn = complaints['Borough'] == \"BROOKLYN\"\n", "complaints[is_noise & in_brooklyn][:5]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "O si solo se quieren unas pocas columnas:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:09:11.955373Z", "start_time": "2018-10-25T20:09:11.942045Z" } }, "outputs": [], "source": [ "complaints[is_noise & in_brooklyn][['Complaint Type', 'Borough', 'Created Date', 'Descriptor']][:10]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Una desviación con respecto a los arreglos de Numpy\n", "\n", "Por otro lado, para pandas una columna es del tipo 'pd.Series'" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:09:34.010413Z", "start_time": "2018-10-25T20:09:34.005446Z" } }, "outputs": [], "source": [ "pd.Series([1,2,3])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "E internamente las series de pandas son arreglos de Numpy. Si se añade '.values' al final de cualquier 'Serie', se obtiene el arreglo de numpy interno" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:09:37.541754Z", "start_time": "2018-10-25T20:09:37.536918Z" } }, "outputs": [], "source": [ "import numpy as np\n", "np.array([1,2,3])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:09:40.424092Z", "start_time": "2018-10-25T20:09:40.418139Z" } }, "outputs": [], "source": [ "pd.Series([1,2,3]).values" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Entonces, esta selección de arreglos binarios es actualmente algo que funciona con cualquier arreglo Numpy" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:09:47.295669Z", "start_time": "2018-10-25T20:09:47.290806Z" } }, "outputs": [], "source": [ "arr = np.array([1,2,3])\n", "arr != 2" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:09:48.680366Z", "start_time": "2018-10-25T20:09:48.676611Z" } }, "outputs": [], "source": [ "arr[arr!=2]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Entonces, qué ciudad tiene la mayor cantidad de quejas por ruido?\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:10:20.633402Z", "start_time": "2018-10-25T20:10:20.616567Z" } }, "outputs": [], "source": [ "is_noise = complaints['Complaint Type'] == 'Noise - Street/Sidewalk'\n", "noise_complaints = complaints[is_noise]\n", "noise_complaints['Borough'].value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Es Manhattan! pero qué si lo que queremos es dividir por el total de quejas para hacer que tenga un poco más de sentido? Eso podría ser fácil también:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:11:57.445926Z", "start_time": "2018-10-25T20:11:57.424121Z" } }, "outputs": [], "source": [ "noise_complaints_counts = noise_complaints['Borough'].value_counts()\n", "complaint_counts = complaints['Borough'].value_counts()\n", "noise_complaints_counts/complaint_counts" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Y graficando:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:12:10.041405Z", "start_time": "2018-10-25T20:12:09.935037Z" } }, "outputs": [], "source": [ "(noise_complaints_counts / complaint_counts).plot(kind='bar')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Es evidente que Manhattan tiene la mayor cantidad de quejas que otros vecindarios." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Capítulo 4\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:15:22.685759Z", "start_time": "2018-10-25T20:15:22.629272Z" } }, "outputs": [], "source": [ "bikes = pd.read_csv('bikes.csv', sep=';', encoding='latin1', parse_dates=['Date'], dayfirst=True, index_col='Date')\n", "display(bikes)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:15:23.259187Z", "start_time": "2018-10-25T20:15:22.980160Z" } }, "outputs": [], "source": [ "bikes['Berri 1'].plot()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:15:52.246146Z", "start_time": "2018-10-25T20:15:52.229721Z" }, "scrolled": true }, "outputs": [], "source": [ "berri_bikes = bikes[['Berri 1']]\n", "berri_bikes2 = bikes['Berri 1']\n", "display(berri_bikes)\n", "display(berri_bikes2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:16:01.136663Z", "start_time": "2018-10-25T20:16:01.125415Z" }, "scrolled": true }, "outputs": [], "source": [ "display(berri_bikes.index.day)\n", "berri_bikes.index.weekday" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:17:18.868453Z", "start_time": "2018-10-25T20:17:18.823958Z" } }, "outputs": [], "source": [ "berri_bikes['weekday']=berri_bikes.index.weekday\n", "berri_bikes[:5]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:17:28.547930Z", "start_time": "2018-10-25T20:17:28.459648Z" } }, "outputs": [], "source": [ "berri_bikes.loc[:,1]=berri_bikes.index.weekday\n", "berri_bikes" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:17:50.282444Z", "start_time": "2018-10-25T20:17:50.269095Z" }, "scrolled": true }, "outputs": [], "source": [ "berri_bikes = berri_bikes.drop([1], axis=1) #berri_bikes.drop([1], axis=1, inplace=True)\n", "display(berri_bikes)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:18:07.732466Z", "start_time": "2018-10-25T20:18:07.715891Z" }, "scrolled": true }, "outputs": [], "source": [ "berri_bikes.drop(['weekday'], axis=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:18:10.389515Z", "start_time": "2018-10-25T20:18:10.377606Z" }, "scrolled": true }, "outputs": [], "source": [ "berri_bikes" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:18:39.318177Z", "start_time": "2018-10-25T20:18:39.306187Z" } }, "outputs": [], "source": [ "weekday_count = berri_bikes.groupby('weekday').aggregate(sum)\n", "weekday_count.index = ['M','T','W','T','F','S','S']\n", "weekday_count" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:18:49.519944Z", "start_time": "2018-10-25T20:18:49.395967Z" } }, "outputs": [], "source": [ "weekday_count.plot(kind='bar')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Capítulo 5\n", "\n", "Ya hemos visto que Pandas se maneja bien con fechas, y también es asombroso trabajando con strings!\n", "Vamos a trabajar con datos climáticos de Montreal en invierno (sí, es frío!!).\n", "Panda tiene todas las operaciones vectorizadas de strings, y vamos a transformar un montón de strings que contienen la palabra 'snow' en vectores de números en un trice." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:19:16.218126Z", "start_time": "2018-10-25T20:19:16.215561Z" }, "scrolled": true }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib as plt\n", "from IPython.display import display" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Cargamos los datos" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:19:18.634384Z", "start_time": "2018-10-25T20:19:18.603065Z" } }, "outputs": [], "source": [ "# chapter 6\n", "\n", "weather_2012 = pd.read_csv('weather_2012.csv', parse_dates=True, index_col='Date/Time')\n", "weather_2012[:5]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Operaciones strings\n", "\n", "Se verá que la columna 'Weather' tiene una descripción en texto del clima que estaba ocurriendo en cada hora. Vamos a asumir que está nevando si la descripción contiene 'snow'\n", "\n", "pandas provee funciones strings vectorizadas, para hacer fácil el operar en columnas que contienen texto. Hay muchos ejemplos en la documentación http://pandas.pydata.org/pandas-docs/stable/basics.html#vectorized-string-methods" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:20:07.954105Z", "start_time": "2018-10-25T20:20:07.941236Z" } }, "outputs": [], "source": [ "weather_description = weather_2012['Weather']\n", "is_snowing = weather_description.str.contains('Snow')\n", "is_snowing" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "No es muy útil, ya que es un vector binario y no es fácil mirarlo" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:20:16.247633Z", "start_time": "2018-10-25T20:20:16.239914Z" } }, "outputs": [], "source": [ "# No muy útil\n", "is_snowing.head() " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:20:22.481746Z", "start_time": "2018-10-25T20:20:22.464310Z" }, "scrolled": true }, "outputs": [], "source": [ "is_snowing.plot(style='ro', figsize=(15,10))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:20:49.865916Z", "start_time": "2018-10-25T20:20:49.525958Z" } }, "outputs": [], "source": [ "is_snowing.astype(int).plot(style='ro', ms=0.5, figsize=(15,10))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Usando resample para encontrar el mes más nevoso\n", "\n", "Si queremos que la temperatura media de cada mes, podemos usar el método 'resample()' de la siguiente forma:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:22:08.430693Z", "start_time": "2018-10-25T20:22:08.228819Z" } }, "outputs": [], "source": [ "weather_2012['Temp (C)'].resample('M', how=np.median).plot(kind='bar', figsize=(15,10))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Como era de esperarse, Julio y Agosto son los más calientitos (hemisferio norte!!)\n", "\n", "Entonces podemos pensar en lo nevoso como un montón de 1s y 0s en vez de 'Trues' y 'Falses':" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:23:06.436495Z", "start_time": "2018-10-25T20:23:06.421777Z" } }, "outputs": [], "source": [ "is_snowing.astype(float)[:10]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Y entonces usar 'resample' para encontrar el porcentaje de tiempo que estuvo nevando en cada mes" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:23:17.192374Z", "start_time": "2018-10-25T20:23:17.185126Z" } }, "outputs": [], "source": [ "is_snowing.astype(float).resample('M', how=np.mean)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:23:31.727100Z", "start_time": "2018-10-25T20:23:31.582358Z" } }, "outputs": [], "source": [ "is_snowing.astype(float).resample('M', how=np.sum).plot(kind='bar')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Ahora sabemos! En diciembre del 2012 fue el mes más nevoso. También, este gráfico sufiere que la nieve empieza abrupta en noviembre, y se desvanece lentamente en un largo tiempo, con las últimas nieves en Abril o Mayo." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Ploteando temperatura y nevosidad juntas\n", "\n", "También se poueden combinar los dos estadíticos (temperatura y nieve) dentro del dataframe y plotearlas juntas" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:24:20.777266Z", "start_time": "2018-10-25T20:24:20.764943Z" } }, "outputs": [], "source": [ "temperature = weather_2012['Temp (C)'].resample('M', how=np.median)\n", "is_snowing = weather_2012['Weather'].str.contains('Snow')\n", "snowiness = is_snowing.astype(float).resample('M', how=np.mean)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:24:30.716618Z", "start_time": "2018-10-25T20:24:30.711205Z" } }, "outputs": [], "source": [ "# Nombramos las columnas\n", "temperature.name = 'Temperature'\n", "snowiness.name = 'Snowiness'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Vamos a usar 'concat' nuevamente para combinar los dos estadísticos dentro de un solo dataframe:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:24:37.421668Z", "start_time": "2018-10-25T20:24:37.410740Z" } }, "outputs": [], "source": [ "stats = pd.concat([temperature, snowiness],axis=1)\n", "stats" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:24:43.870972Z", "start_time": "2018-10-25T20:24:43.618780Z" } }, "outputs": [], "source": [ "stats.plot(kind='bar', figsize=(15,10))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Lo que no funcuiona tan bien porque las escalas están mal. Para solucionar esto, lo plotearemos en dos gráficos separados:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:24:58.431543Z", "start_time": "2018-10-25T20:24:58.079315Z" }, "scrolled": false }, "outputs": [], "source": [ "stats.plot(kind='bar',subplots=True,figsize=(15,10))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Capítulo 6\n", "\n", "### Limpiando datos sucios nunca es un juego, pero con pandas es más fácil\n", "\n", "Uno de los problemas principales con datos sucios es: cómo sabes si está sucio o si no lo está.\n", "\n", "Vamos a usar el conjunto de datos de solicitudes del servicio NYC 311, dado que es grande y pesado" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:25:42.045555Z", "start_time": "2018-10-25T20:25:42.039033Z" } }, "outputs": [], "source": [ "# CHAPTER 7\n", "import pandas as pd\n", "\n", "# Make the graphs a bit prettier, and bigger\n", "# Always display all the columns\n", "pd.set_option('display.width', 5000) \n", "pd.set_option('display.max_columns', 60) \n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:25:48.438336Z", "start_time": "2018-10-25T20:25:47.325529Z" } }, "outputs": [], "source": [ "requests = pd.read_csv('311-service-requests.csv')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Vamos a mirar unas pocas columnas aquí. Ya se sabe desde antes que hay algunos problemas con el 'zip code', así que vamos a mirarlo al principio.\n", "\n", "Para tener un sentido de cuándo una columna tiene problemas, se puede usar '.unique()' para mirar todos sus valores. Si es una columna numérica, se mostrará entonces un histograma para obtener un sentido de la distribución.\n", "\n", "Cuando se mira a los valores únicos en 'Incident Zip', se vuelve claro rápidamente que es un desastre.\n", "\n", "Algunos de los problemas:\n", " - Algunos han sido asignados como strings, otros como floats\n", " - Hay algunos nans\n", " - Algunos de los codigos zip son 29616-0759 o 83\n", " - Hay algunos valores N/A que pandas no los reconoce, como 'N/A' y 'NO CLUE'\n", " \n", "Qué podemos hacer:\n", " - Normalizar 'N/A' y 'NO CLUE' en valores nan regulares\n", " - Mirar qué pasa con el 83 y decidir qué hacer\n", " - Convertir todo a string." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:26:00.672490Z", "start_time": "2018-10-25T20:26:00.540194Z" } }, "outputs": [], "source": [ "requests" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:26:10.048930Z", "start_time": "2018-10-25T20:26:10.032880Z" }, "scrolled": true }, "outputs": [], "source": [ "requests['Incident Zip'].unique()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:27:10.476683Z", "start_time": "2018-10-25T20:27:10.454057Z" }, "scrolled": true }, "outputs": [], "source": [ "requests.duplicated('Incident Zip')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:27:51.448203Z", "start_time": "2018-10-25T20:27:51.311039Z" } }, "outputs": [], "source": [ "requests.drop_duplicates('Incident Zip', keep='last')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Fijando los valores nan y solucionando la confusión string/float\n", "\n", "Vamos a pasarle la opción 'na_values' a 'pd.read_csv' para limpiar esto un poquito. También podemos especificar que el tipo 'Incident Zip' es un string, no un float." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:28:43.622048Z", "start_time": "2018-10-25T20:28:42.384614Z" } }, "outputs": [], "source": [ "na_values = ['NO CLUE', 'N/A', '0']\n", "requests = pd.read_csv('311-service-requests.csv', na_values=na_values, dtype={'Incident Zip': str})\n", "requests['Incident Zip'].unique()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Qué sucede con los guiones?" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:30:47.772867Z", "start_time": "2018-10-25T20:30:47.709297Z" }, "scrolled": true }, "outputs": [], "source": [ "requests[requests['Incident Zip'].str.contains('-').fillna(False)]['Incident Zip']" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:31:02.954171Z", "start_time": "2018-10-25T20:31:02.893098Z" } }, "outputs": [], "source": [ "rows_with_dashes = requests['Incident Zip'].str.contains('-').fillna(False)\n", "len(requests[rows_with_dashes])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:31:05.613794Z", "start_time": "2018-10-25T20:31:05.573567Z" } }, "outputs": [], "source": [ "requests[rows_with_dashes]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Una posible forma de borrar los datos faltantes podría ser de la forma:\n", "\n", "'requests['Incident Zip'][rows_with_dashes] = np.nan'\n", "\n", "Pero resulta que los códigos zips con 9 dígitos es normal. Vamos a mirar los códigos zip de más de 5 dígitos, para ver que están ok, y después truncarlos" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:31:45.980119Z", "start_time": "2018-10-25T20:31:45.945966Z" } }, "outputs": [], "source": [ "long_zip_codes = requests['Incident Zip'].str.len()>5\n", "requests['Incident Zip'][long_zip_codes].unique()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Los cuales lucen adecuados para truncarlos" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:32:17.054003Z", "start_time": "2018-10-25T20:32:17.025239Z" } }, "outputs": [], "source": [ "requests['Incident Zip'] = requests['Incident Zip'].str.slice(0,5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Hecho.\n", "\n", "Aún preocupa la presencia de los códigos de la forma '00000', así que los miraremos" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:32:19.208580Z", "start_time": "2018-10-25T20:32:19.166068Z" } }, "outputs": [], "source": [ "requests[requests['Incident Zip'] == '00000']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Lo cual luce mal, vamos a setear estos zip como nan" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:32:27.110605Z", "start_time": "2018-10-25T20:32:27.051660Z" } }, "outputs": [], "source": [ "zero_zips = requests['Incident Zip']=='00000'\n", "requests['Incident Zip'][zero_zips] = np.nan" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Y podemos ver qué hay ahora" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:32:31.349903Z", "start_time": "2018-10-25T20:32:31.333621Z" }, "scrolled": true }, "outputs": [], "source": [ "unique_zips = requests['Incident Zip'].unique()\n", "unique_zips.astype(float).sort()\n", "unique_zips" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Genial!\n", "\n", "Esto es mucho más claro. Hay, sin embargo, algo un poco extraño. Después de mirar el código '77056' en googlemaps, está en texas, así que vamos a mirar más cerca:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:33:08.669797Z", "start_time": "2018-10-25T20:33:08.566661Z" }, "scrolled": true }, "outputs": [], "source": [ "zips = requests['Incident Zip']\n", "is_close = zips.str.startswith('0') | zips.str.startswith('1')\n", "is_far = ~(is_close.fillna(True).astype(bool))\n", "zips[is_far]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:33:16.537311Z", "start_time": "2018-10-25T20:33:16.512668Z" }, "scrolled": false }, "outputs": [], "source": [ "requests[is_far][['Incident Zip', 'Descriptor', 'City']].sort_values('Incident Zip')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Okey, hay solicitudes que vienen desde LA y Houston! Bien saberlo. Filtrando por zipcode es probablemente una mala forma de manejar esto, deberíamos mejor mirar la ciudad en vez." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:33:54.214697Z", "start_time": "2018-10-25T20:33:54.165974Z" } }, "outputs": [], "source": [ "requests['City'].str.upper().value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Luce como que hay quejas que son válidas, así que las dejaremos solas." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Poniendo todo junto" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:35:10.423495Z", "start_time": "2018-10-25T20:35:09.186165Z" } }, "outputs": [], "source": [ "na_values = ['NO CLUE', 'N/A', '0']\n", "requests = pd.read_csv('311-service-requests.csv', \n", " na_values=na_values, \n", " dtype={'Incident Zip': str})\n", "def fix_zip_codes(zips):\n", " # Truncate everything to length 5 \n", " zips = zips.str.slice(0, 5)\n", " \n", " # Set 00000 zip codes to nan\n", " zero_zips = zips == '00000'\n", " zips[zero_zips] = np.nan\n", " \n", " return zips" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:35:13.505761Z", "start_time": "2018-10-25T20:35:13.440651Z" } }, "outputs": [], "source": [ "requests['Incident Zip'] = fix_zip_codes(requests['Incident Zip'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:35:14.191603Z", "start_time": "2018-10-25T20:35:14.167805Z" } }, "outputs": [], "source": [ "requests['Incident Zip'].unique()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Capítulo 7\n", "\n", "### Analizando UNIX timestamps\n", "\n", "No es obio cómo lidiar con Unix timestamps en pandas.\n", "\n", "El archivo que usaremos es un popular archivo popular para concursos, el cual está en el sistema en\n", "'/var/log/popularity-contest'.\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:36:29.459504Z", "start_time": "2018-10-25T20:36:29.436462Z" } }, "outputs": [], "source": [ "# CHAPTER 8\n", "\n", "popcon = pd.read_csv('popularity-contest', sep=' ', )[:-1]\n", "display(popcon[:5])\n", "popcon.columns = ['atime', 'ctime', 'package-name', 'mru-program', 'tag']\n", "popcon[:5]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:36:34.719732Z", "start_time": "2018-10-25T20:36:34.712622Z" }, "scrolled": true }, "outputs": [], "source": [ "popcon.atime" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "La parte máfica acerca de analizar timestamps en pandas es que los datetimes de numpy ya están guardados como Unix timestamps. Entonces todo lo que necesitamos es decirle a pandas que esos números enteros son actualmente fechas, y no se necesita hacer ninguna conversión.\n", "\n", "Vamos a convertir estos 'ints' para empezar:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:36:51.347132Z", "start_time": "2018-10-25T20:36:51.300524Z" } }, "outputs": [], "source": [ "popcon['atime'] = popcon['atime'].astype(int)\n", "popcon['ctime'] = popcon['ctime'].astype(int)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:37:00.610508Z", "start_time": "2018-10-25T20:37:00.602616Z" } }, "outputs": [], "source": [ "popcon['atime'] " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Cada arreglo numpy y serie de pandas tiene un 'dtype' -- el cual es usual 'int64', 'float64' o 'object'. Algunos de los tipos de tiempo disponibles son 'datetime64[s]', 'datetime64[ms]' y 'datetime64[us]'. Hay también tipos 'timedelta', similarmente.\n", "\n", "Podemos usar la función 'pd.to_datetime' para convertir los números enteros en fechas. Esto es una operación a tiempo constante, es decir, no estamos cambiando ninguno de los datos, sólo cómo pandas piensa acerca de él." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:37:26.865755Z", "start_time": "2018-10-25T20:37:26.860651Z" } }, "outputs": [], "source": [ "popcon['atime'] = pd.to_datetime(popcon['atime'], unit='s')\n", "popcon['ctime'] = pd.to_datetime(popcon['ctime'], unit='s')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Si miramos al 'dtype' ahora, es ''1970-01-01'] #Se eliminan las fechas con timestamp 0" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:37:57.607855Z", "start_time": "2018-10-25T20:37:57.587417Z" }, "scrolled": true }, "outputs": [], "source": [ "popcon" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Ahora podemos usar las habilidades mágicas de pandas sobre strings para mirar a las colimnas donde el nombre del paquete no contiene 'lib'." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:38:10.465514Z", "start_time": "2018-10-25T20:38:10.458272Z" } }, "outputs": [], "source": [ "nonlibraries = popcon[~popcon['package-name'].str.contains('lib')]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:38:11.965848Z", "start_time": "2018-10-25T20:38:11.943780Z" }, "scrolled": true }, "outputs": [], "source": [ "nonlibraries" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-10-25T20:38:19.399437Z", "start_time": "2018-10-25T20:38:19.378302Z" } }, "outputs": [], "source": [ "nonlibraries.sort_values('ctime',ascending=False)[:10]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python [default]", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": { "height": "calc(100% - 180px)", "left": "10px", "top": "150px", "width": "253px" }, "toc_section_display": true, "toc_window_display": true } }, "nbformat": 4, "nbformat_minor": 2 }