{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "\n", "%matplotlib inline\n", "import matplotlib.pyplot as plt\n", "\n", "import pandas as pd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Exercise 1\n", "- load the dataset: `../data/international-airline-passengers.csv`\n", "- inspect it using the `.info()` and `.head()` commands\n", "- use the function `pd.to_datetime()` to change the column type of 'Month' to a datetime type\n", "- set the index of df to be a datetime index using the column 'Month' and the `df.set_index()` method\n", "- choose the appropriate plot and display the data\n", "- choose appropriate scale\n", "- label the axes" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# - load the dataset: ../data/international-airline-passengers.csv\n", "df = pd.read_csv('../data/international-airline-passengers.csv')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# - inspect it using the .info() and .head() commands\n", "df.info()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# - use the function to_datetime() to change the column type of 'Month' to a datetime type\n", "# - set the index of df to be a datetime index using the column 'Month' and the set_index() method\n", "\n", "df['Month'] = pd.to_datetime(df['Month'])\n", "df = df.set_index('Month')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# - choose the appropriate plot and display the data\n", "# - choose appropriate scale\n", "# - label the axes\n", "\n", "df.plot();" ] }, { "cell_type": 
"markdown", "metadata": {}, "source": [ "## Exercise 2\n", "- load the dataset: `../data/weight-height.csv`\n", "- inspect it\n", "- plot it using a scatter plot with Weight as a function of Height\n", "- plot the male and female populations with 2 different colors on a new scatter plot\n", "- remember to label the axes" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# - load the dataset: ../data/weight-height.csv\n", "# - inspect it\n", "df = pd.read_csv('../data/weight-height.csv')\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.info()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.describe()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "df['Gender'].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# - plot it using a scatter plot with Weight as a function of Height\n", "_ = df.plot(kind='scatter', x='Height', y='Weight');" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# - plot the male and female populations with 2 different colors on a new scatter plot\n", "# - remember to label the axes\n", "\n", "# this can be done in several ways, showing 2 here:\n", "males = df[df['Gender'] == 'Male']\n", "females = df.query('Gender == \"Female\"')\n", "fig, ax = plt.subplots()\n", "\n", "males.plot(kind='scatter', x='Height', y='Weight',\n", " ax=ax, color='blue', alpha=0.3,\n", " title='Male & Female Populations')\n", "\n", "females.plot(kind='scatter', x='Height', y='Weight',\n", " ax=ax, color='red', alpha=0.3);" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df['Gendercolor'] = df['Gender'].map({'Male': 'blue', 'Female': 'red'})\n", "df.head()" ] }, { "cell_type": 
"code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.plot(kind='scatter', \n", " x='Height',\n", " y='Weight',\n", " c=df['Gendercolor'],\n", " alpha=0.3,\n", " title='Male & Female Populations');" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fig, ax = plt.subplots()\n", "ax.plot(males['Height'], males['Weight'], 'ob', \n", " females['Height'], females['Weight'], 'or', alpha=0.3)\n", "plt.xlabel('Height')\n", "plt.ylabel('Weight')\n", "plt.title('Male & Female Populations');" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "## Exercise 3\n", "- plot the histogram of the heights for males and for females on the same plot\n", "- use alpha to control transparency in the plot command\n", "- plot a vertical line at the mean of each population using `plt.axvline()`" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "males['Height'].plot(kind='hist',\n", " bins=50,\n", " range=(50, 80),\n", " alpha=0.3,\n", " color='blue')\n", "\n", "females['Height'].plot(kind='hist',\n", " bins=50,\n", " range=(50, 80),\n", " alpha=0.3,\n", " color='red')\n", "\n", "plt.title('Height distribution')\n", "plt.legend([\"Males\", \"Females\"])\n", "plt.xlabel(\"Heigth (in)\")\n", "\n", "\n", "plt.axvline(males['Height'].mean(), color='blue', linewidth=2)\n", "plt.axvline(females['Height'].mean(), color='red', linewidth=2);" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "males['Height'].plot(kind='hist',\n", " bins=200,\n", " range=(50, 80),\n", " alpha=0.3,\n", " color='blue',\n", " cumulative=True,\n", " density=True)\n", "\n", "females['Height'].plot(kind='hist',\n", " bins=200,\n", " range=(50, 80),\n", " alpha=0.3,\n", " color='red',\n", " cumulative=True,\n", " density=True)\n", "\n", "plt.title('Height distribution')\n", "plt.legend([\"Males\", \"Females\"])\n", "plt.xlabel(\"Heigth 
(in)\")\n", "\n", "plt.axhline(0.8)\n", "plt.axhline(0.5)\n", "plt.axhline(0.2);" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Exercise 4\n", "- plot the weights of the males and females using a box plot\n", "- which one is easier to read?\n", "- (remember to put in titles, axes and legends)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dfpvt = df.pivot(columns = 'Gender', values = 'Weight')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dfpvt.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dfpvt.info()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dfpvt.plot(kind='box')\n", "plt.title('Weight Box Plot')\n", "plt.ylabel(\"Weight (lbs)\");" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Exercise 5\n", "- load the dataset: `../data/titanic-train.csv`\n", "- learn about scattermatrix here: http://pandas.pydata.org/pandas-docs/stable/visualization.html\n", "- display the data using a scattermatrix" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('../data/titanic-train.csv')\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from pandas.plotting import scatter_matrix" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "_ = scatter_matrix(df.drop('PassengerId', axis=1), figsize=(10, 10))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", 
"nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.10" } }, "nbformat": 4, "nbformat_minor": 1 }