add all files
This commit is contained in:
404
solutions/2 Data exploration Exercises Solution.ipynb
Normal file
404
solutions/2 Data exploration Exercises Solution.ipynb
Normal file
@@ -0,0 +1,404 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"\n",
|
||||
"%matplotlib inline\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"\n",
|
||||
"import pandas as pd"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Exercise 1\n",
|
||||
"- load the dataset: `../data/international-airline-passengers.csv`\n",
|
||||
"- inspect it using the `.info()` and `.head()` commands\n",
|
||||
"- use the function `pd.to_datetime()` to change the column type of 'Month' to a datatime type\n",
|
||||
"- set the index of df to be a datetime index using the column 'Month' and the `df.set_index()` method\n",
|
||||
"- choose the appropriate plot and display the data\n",
|
||||
"- choose appropriate scale\n",
|
||||
"- label the axes"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# - load the dataset: ../data/international-airline-passengers.csv\n",
|
||||
"df = pd.read_csv('../data/international-airline-passengers.csv')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# - inspect it using the .info() and .head() commands\n",
|
||||
"df.info()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# - use the function to_datetime() to change the column type of 'Month' to a datatime type\n",
|
||||
"# - set the index of df to be a datetime index using the column 'Month' and tthe set_index() method\n",
|
||||
"\n",
|
||||
"df['Month'] = pd.to_datetime(df['Month'])\n",
|
||||
"df = df.set_index('Month')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# - choose the appropriate plot and display the data\n",
|
||||
"# - choose appropriate scale\n",
|
||||
"# - label the axes\n",
|
||||
"\n",
|
||||
"df.plot();"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Exercise 2\n",
|
||||
"- load the dataset: `../data/weight-height.csv`\n",
|
||||
"- inspect it\n",
|
||||
"- plot it using a scatter plot with Weight as a function of Height\n",
|
||||
"- plot the male and female populations with 2 different colors on a new scatter plot\n",
|
||||
"- remember to label the axes"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# - load the dataset: ../data/weight-height.csv\n",
|
||||
"# - inspect it\n",
|
||||
"df = pd.read_csv('../data/weight-height.csv')\n",
|
||||
"df.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df.info()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df.describe()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df['Gender'].value_counts()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# - plot it using a scatter plot with Weight as a function of Height\n",
|
||||
"_ = df.plot(kind='scatter', x='Height', y='Weight');"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# - plot the male and female populations with 2 different colors on a new scatter plot\n",
|
||||
"# - remember to label the axes\n",
|
||||
"\n",
|
||||
"# this can be done in several ways, showing 2 here:\n",
|
||||
"males = df[df['Gender'] == 'Male']\n",
|
||||
"females = df.query('Gender == \"Female\"')\n",
|
||||
"fig, ax = plt.subplots()\n",
|
||||
"\n",
|
||||
"males.plot(kind='scatter', x='Height', y='Weight',\n",
|
||||
" ax=ax, color='blue', alpha=0.3,\n",
|
||||
" title='Male & Female Populations')\n",
|
||||
"\n",
|
||||
"females.plot(kind='scatter', x='Height', y='Weight',\n",
|
||||
" ax=ax, color='red', alpha=0.3);"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df['Gendercolor'] = df['Gender'].map({'Male': 'blue', 'Female': 'red'})\n",
|
||||
"df.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df.plot(kind='scatter', \n",
|
||||
" x='Height',\n",
|
||||
" y='Weight',\n",
|
||||
" c=df['Gendercolor'],\n",
|
||||
" alpha=0.3,\n",
|
||||
" title='Male & Female Populations');"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"fig, ax = plt.subplots()\n",
|
||||
"ax.plot(males['Height'], males['Weight'], 'ob', \n",
|
||||
" females['Height'], females['Weight'], 'or', alpha=0.3)\n",
|
||||
"plt.xlabel('Height')\n",
|
||||
"plt.ylabel('Weight')\n",
|
||||
"plt.title('Male & Female Populations');"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"source": [
|
||||
"## Exercise 3\n",
|
||||
"- plot the histogram of the heights for males and for females on the same plot\n",
|
||||
"- use alpha to control transparency in the plot comand\n",
|
||||
"- plot a vertical line at the mean of each population using `plt.axvline()`"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"males['Height'].plot(kind='hist',\n",
|
||||
" bins=50,\n",
|
||||
" range=(50, 80),\n",
|
||||
" alpha=0.3,\n",
|
||||
" color='blue')\n",
|
||||
"\n",
|
||||
"females['Height'].plot(kind='hist',\n",
|
||||
" bins=50,\n",
|
||||
" range=(50, 80),\n",
|
||||
" alpha=0.3,\n",
|
||||
" color='red')\n",
|
||||
"\n",
|
||||
"plt.title('Height distribution')\n",
|
||||
"plt.legend([\"Males\", \"Females\"])\n",
|
||||
"plt.xlabel(\"Heigth (in)\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"plt.axvline(males['Height'].mean(), color='blue', linewidth=2)\n",
|
||||
"plt.axvline(females['Height'].mean(), color='red', linewidth=2);"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"males['Height'].plot(kind='hist',\n",
|
||||
" bins=200,\n",
|
||||
" range=(50, 80),\n",
|
||||
" alpha=0.3,\n",
|
||||
" color='blue',\n",
|
||||
" cumulative=True,\n",
|
||||
" density=True)\n",
|
||||
"\n",
|
||||
"females['Height'].plot(kind='hist',\n",
|
||||
" bins=200,\n",
|
||||
" range=(50, 80),\n",
|
||||
" alpha=0.3,\n",
|
||||
" color='red',\n",
|
||||
" cumulative=True,\n",
|
||||
" density=True)\n",
|
||||
"\n",
|
||||
"plt.title('Height distribution')\n",
|
||||
"plt.legend([\"Males\", \"Females\"])\n",
|
||||
"plt.xlabel(\"Heigth (in)\")\n",
|
||||
"\n",
|
||||
"plt.axhline(0.8)\n",
|
||||
"plt.axhline(0.5)\n",
|
||||
"plt.axhline(0.2);"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Exercise 4\n",
|
||||
"- plot the weights of the males and females using a box plot\n",
|
||||
"- which one is easier to read?\n",
|
||||
"- (remember to put in titles, axes and legends)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"dfpvt = df.pivot(columns = 'Gender', values = 'Weight')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"dfpvt.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"dfpvt.info()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"dfpvt.plot(kind='box')\n",
|
||||
"plt.title('Weight Box Plot')\n",
|
||||
"plt.ylabel(\"Weight (lbs)\");"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Exercise 5\n",
|
||||
"- load the dataset: `../data/titanic-train.csv`\n",
|
||||
"- learn about scattermatrix here: http://pandas.pydata.org/pandas-docs/stable/visualization.html\n",
|
||||
"- display the data using a scattermatrix"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df = pd.read_csv('../data/titanic-train.csv')\n",
|
||||
"df.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from pandas.plotting import scatter_matrix"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"_ = scatter_matrix(df.drop('PassengerId', axis=1), figsize=(10, 10))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"anaconda-cloud": {},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.10"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 1
|
||||
}
|
||||
Reference in New Issue
Block a user