{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Exercise 1\n",
    "- load the dataset: `../data/international-airline-passengers.csv`\n",
    "- inspect it using the `.info()` and `.head()` commands\n",
    "- use the function `pd.to_datetime()` to change the column type of 'Month' to a datatime type\n",
    "- set the index of df to be a datetime index using the column 'Month' and the `df.set_index()` method\n",
    "- choose the appropriate plot and display the data\n",
    "- choose appropriate scale\n",
    "- label the axes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# - load the dataset: ../data/international-airline-passengers.csv\n",
    "df = pd.read_csv('../data/international-airline-passengers.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# - inspect it using the .info() and .head() commands\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# - use the function to_datetime() to change the column type of 'Month' to a datatime type\n",
    "# - set the index of df to be a datetime index using the column 'Month' and tthe set_index() method\n",
    "\n",
    "df['Month'] = pd.to_datetime(df['Month'])\n",
    "df = df.set_index('Month')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# - choose the appropriate plot and display the data\n",
    "# - choose appropriate scale\n",
    "# - label the axes\n",
    "\n",
    "df.plot();"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Exercise 2\n",
    "- load the dataset: `../data/weight-height.csv`\n",
    "- inspect it\n",
    "- plot it using a scatter plot with Weight as a function of Height\n",
    "- plot the male and female populations with 2 different colors on a new scatter plot\n",
    "- remember to label the axes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# - load the dataset: ../data/weight-height.csv\n",
    "# - inspect it\n",
    "df = pd.read_csv('../data/weight-height.csv')\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "df['Gender'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# - plot it using a scatter plot with Weight as a function of Height\n",
    "_ = df.plot(kind='scatter', x='Height', y='Weight');"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# - plot the male and female populations with 2 different colors on a new scatter plot\n",
    "# - remember to label the axes\n",
    "\n",
    "# this can be done in several ways, showing 2 here:\n",
    "males = df[df['Gender'] == 'Male']\n",
    "females = df.query('Gender == \"Female\"')\n",
    "fig, ax = plt.subplots()\n",
    "\n",
    "males.plot(kind='scatter', x='Height', y='Weight',\n",
    "           ax=ax, color='blue', alpha=0.3,\n",
    "           title='Male & Female Populations')\n",
    "\n",
    "females.plot(kind='scatter', x='Height', y='Weight',\n",
    "             ax=ax, color='red', alpha=0.3);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['Gendercolor'] = df['Gender'].map({'Male': 'blue', 'Female': 'red'})\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.plot(kind='scatter', \n",
    "        x='Height',\n",
    "        y='Weight',\n",
    "        c=df['Gendercolor'],\n",
    "        alpha=0.3,\n",
    "        title='Male & Female Populations');"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots()\n",
    "ax.plot(males['Height'], males['Weight'], 'ob', \n",
    "        females['Height'], females['Weight'], 'or', alpha=0.3)\n",
    "plt.xlabel('Height')\n",
    "plt.ylabel('Weight')\n",
    "plt.title('Male & Female Populations');"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "## Exercise 3\n",
    "- plot the histogram of the heights for males and for females on the same plot\n",
    "- use alpha to control transparency in the plot comand\n",
    "- plot a vertical line at the mean of each population using `plt.axvline()`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "males['Height'].plot(kind='hist',\n",
    "                     bins=50,\n",
    "                     range=(50, 80),\n",
    "                     alpha=0.3,\n",
    "                     color='blue')\n",
    "\n",
    "females['Height'].plot(kind='hist',\n",
    "                       bins=50,\n",
    "                       range=(50, 80),\n",
    "                       alpha=0.3,\n",
    "                       color='red')\n",
    "\n",
    "plt.title('Height distribution')\n",
    "plt.legend([\"Males\", \"Females\"])\n",
    "plt.xlabel(\"Heigth (in)\")\n",
    "\n",
    "\n",
    "plt.axvline(males['Height'].mean(), color='blue', linewidth=2)\n",
    "plt.axvline(females['Height'].mean(), color='red', linewidth=2);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "males['Height'].plot(kind='hist',\n",
    "                     bins=200,\n",
    "                     range=(50, 80),\n",
    "                     alpha=0.3,\n",
    "                     color='blue',\n",
    "                     cumulative=True,\n",
    "                     density=True)\n",
    "\n",
    "females['Height'].plot(kind='hist',\n",
    "                       bins=200,\n",
    "                       range=(50, 80),\n",
    "                       alpha=0.3,\n",
    "                       color='red',\n",
    "                       cumulative=True,\n",
    "                       density=True)\n",
    "\n",
    "plt.title('Height distribution')\n",
    "plt.legend([\"Males\", \"Females\"])\n",
    "plt.xlabel(\"Heigth (in)\")\n",
    "\n",
    "plt.axhline(0.8)\n",
    "plt.axhline(0.5)\n",
    "plt.axhline(0.2);"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Exercise 4\n",
    "- plot the weights of the males and females using a box plot\n",
    "- which one is easier to read?\n",
    "- (remember to put in titles, axes and legends)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dfpvt = df.pivot(columns = 'Gender', values = 'Weight')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dfpvt.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dfpvt.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dfpvt.plot(kind='box')\n",
    "plt.title('Weight Box Plot')\n",
    "plt.ylabel(\"Weight (lbs)\");"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Exercise 5\n",
    "- load the dataset: `../data/titanic-train.csv`\n",
    "- learn about scattermatrix here: http://pandas.pydata.org/pandas-docs/stable/visualization.html\n",
    "- display the data using a scattermatrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('../data/titanic-train.csv')\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pandas.plotting import scatter_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "_ = scatter_matrix(df.drop('PassengerId', axis=1), figsize=(10, 10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}