add all files

This commit is contained in:
Sem van der Hoeven
2021-05-26 15:12:05 +02:00
parent 5c3be10247
commit d979ca38f5
44 changed files with 43574 additions and 0 deletions

@@ -0,0 +1,872 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 9 Improving performance"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"%matplotlib inline\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Learning curves"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.datasets import load_digits"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"digits = load_digits()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X, y = digits.data, digits.target"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for i in range(8):\n",
" plt.subplot(1,8,i+1)\n",
" plt.imshow(X.reshape(-1, 8, 8)[i], cmap='gray')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from tensorflow.keras.models import Sequential\n",
"from tensorflow.keras.layers import Dense\n",
"from tensorflow.keras.utils import to_categorical\n",
"import tensorflow.keras.backend as K\n",
"from tensorflow.keras.callbacks import EarlyStopping"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"K.clear_session()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model = Sequential()\n",
"model.add(Dense(16, input_shape=(64,), activation='relu'))\n",
"model.add(Dense(10, activation='softmax'))\n",
"model.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# store the initial random weights\n",
"initial_weights = model.get_weights()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"y_cat = to_categorical(y, 10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X_train, X_test, y_train, y_test = train_test_split(X, y_cat,\n",
" test_size=0.3)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X_train.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"train_sizes = (len(X_train) * np.linspace(0.1, 0.999, 4)).astype(int)\n",
"train_sizes"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"train_scores = []\n",
"test_scores = []\n",
"\n",
"for train_size in train_sizes:\n",
" X_train_frac, _, y_train_frac, _ = \\\n",
" train_test_split(X_train, y_train, train_size=train_size)\n",
" \n",
" # at each iteration reset the weights of the model\n",
" # to the initial random weights\n",
" model.set_weights(initial_weights)\n",
" \n",
" h = model.fit(X_train_frac, y_train_frac,\n",
" verbose=0,\n",
" epochs=300,\n",
" callbacks=[EarlyStopping(monitor='loss', patience=1)])\n",
"\n",
" r = model.evaluate(X_train_frac, y_train_frac, verbose=0)\n",
" train_scores.append(r[-1])\n",
" \n",
" e = model.evaluate(X_test, y_test, verbose=0)\n",
" test_scores.append(e[-1])\n",
" \n",
" print(\"Done size: \", train_size)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plt.plot(train_sizes, train_scores, 'o-', label=\"Training score\")\n",
"plt.plot(train_sizes, test_scores, 'o-', label=\"Test score\")\n",
"plt.legend(loc=\"best\")"
]
},
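{
"cell_type": "markdown",
"metadata": {},
"source": [
"As an aside (not part of the original notebook), scikit-learn ships a `learning_curve` utility that automates the loop above. A minimal sketch with a simple `LogisticRegression` on the same digits data:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.model_selection import learning_curve\n",
"\n",
"# cross-validated train/test scores at four training-set sizes\n",
"sizes, tr_scores, te_scores = learning_curve(\n",
"    LogisticRegression(max_iter=1000), X, y,\n",
"    train_sizes=np.linspace(0.1, 1.0, 4), cv=3)\n",
"\n",
"plt.plot(sizes, tr_scores.mean(axis=1), 'o-', label=\"Training score\")\n",
"plt.plot(sizes, te_scores.mean(axis=1), 'o-', label=\"Test score\")\n",
"plt.legend(loc=\"best\")"
]
},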
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Batch Normalization"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from tensorflow.keras.layers import BatchNormalization"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def repeated_training(X_train,\n",
" y_train,\n",
" X_test,\n",
" y_test,\n",
" units=512,\n",
" activation='sigmoid',\n",
" optimizer='sgd',\n",
" do_bn=False,\n",
" epochs=10,\n",
" repeats=3):\n",
" histories = []\n",
" \n",
" for repeat in range(repeats):\n",
" K.clear_session()\n",
"\n",
" model = Sequential()\n",
" \n",
" # first fully connected layer\n",
" model.add(Dense(units,\n",
" input_shape=X_train.shape[1:],\n",
" kernel_initializer='normal',\n",
" activation=activation))\n",
" if do_bn:\n",
" model.add(BatchNormalization())\n",
"\n",
" # second fully connected layer\n",
" model.add(Dense(units,\n",
" kernel_initializer='normal',\n",
" activation=activation))\n",
" if do_bn:\n",
" model.add(BatchNormalization())\n",
"\n",
" # third fully connected layer\n",
" model.add(Dense(units,\n",
" kernel_initializer='normal',\n",
" activation=activation))\n",
" if do_bn:\n",
" model.add(BatchNormalization())\n",
"\n",
" # output layer\n",
" model.add(Dense(10, activation='softmax'))\n",
" \n",
" model.compile(optimizer,\n",
" 'categorical_crossentropy',\n",
" metrics=['accuracy'])\n",
"\n",
" h = model.fit(X_train, y_train,\n",
" validation_data=(X_test, y_test),\n",
" epochs=epochs,\n",
" verbose=0)\n",
" histories.append([h.history['accuracy'], h.history['val_accuracy']])\n",
" print(repeat, end=' ')\n",
"\n",
" histories = np.array(histories)\n",
" \n",
" # calculate mean and standard deviation across repeats:\n",
" mean_acc = histories.mean(axis=0)\n",
" std_acc = histories.std(axis=0)\n",
" print()\n",
" \n",
" return mean_acc[0], std_acc[0], mean_acc[1], std_acc[1]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"mean_acc, std_acc, mean_acc_val, std_acc_val = \\\n",
" repeated_training(X_train, y_train, X_test, y_test, do_bn=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"mean_acc_bn, std_acc_bn, mean_acc_val_bn, std_acc_val_bn = \\\n",
" repeated_training(X_train, y_train, X_test, y_test, do_bn=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def plot_mean_std(m, s):\n",
" plt.plot(m)\n",
" plt.fill_between(range(len(m)), m-s, m+s, alpha=0.1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plot_mean_std(mean_acc, std_acc)\n",
"plot_mean_std(mean_acc_val, std_acc_val)\n",
"plot_mean_std(mean_acc_bn, std_acc_bn)\n",
"plot_mean_std(mean_acc_val_bn, std_acc_val_bn)\n",
"plt.ylim(0, 1.01)\n",
"plt.title(\"Batch Normalization Accuracy\")\n",
"plt.xlabel('Epochs')\n",
"plt.ylabel('Accuracy')\n",
"plt.legend(['Train', 'Test', 'Train with Batch Normalization', 'Test with Batch Normalization'], loc='best');\n"
]
},
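{
"cell_type": "markdown",
"metadata": {},
"source": [
"To put a number on the comparison (an added check, not part of the original notebook), print the final-epoch mean validation accuracies:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"val accuracy without BN:\", mean_acc_val[-1])\n",
"print(\"val accuracy with BN:   \", mean_acc_val_bn[-1])"
]
},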
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Weight Regularization & Dropout"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from tensorflow.keras.layers import Dropout"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model = Sequential()\n",
"model.add(Dropout(0.2, input_shape=X_train.shape[1:]))\n",
"# first fully connected layer\n",
"model.add(Dense(512, kernel_initializer='normal',\n",
" kernel_regularizer='l2', activation='sigmoid'))\n",
"model.add(Dropout(0.4))\n",
"model.add(Dense(10, activation='softmax'))\n",
"\n",
"model.compile('sgd',\n",
" 'categorical_crossentropy',\n",
" metrics=['accuracy'])"
]
},
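{
"cell_type": "markdown",
"metadata": {},
"source": [
"The regularized model above is only defined, not trained. As a quick sanity check (an addition, not in the original notebook), a short run on the same digits split shows it learning:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"h = model.fit(X_train, y_train,\n",
"              validation_data=(X_test, y_test),\n",
"              epochs=10,\n",
"              verbose=0)\n",
"print(\"final validation accuracy:\", h.history['val_accuracy'][-1])"
]
},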
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data augmentation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from tensorflow.keras.preprocessing.image import ImageDataGenerator\n",
"\n",
"generator = ImageDataGenerator(rescale = 1./255,\n",
" width_shift_range=0.1,\n",
" height_shift_range=0.1,\n",
" rotation_range = 20,\n",
" shear_range = 0.3,\n",
" zoom_range = 0.3,\n",
" horizontal_flip = True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"train = generator.flow_from_directory('../data/generator',\n",
" target_size = (128, 128),\n",
" batch_size = 32,\n",
" class_mode = 'binary')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plt.figure(figsize=(12, 12))\n",
"for i in range(16):\n",
" img, label = train.next()\n",
" plt.subplot(4, 4, i+1)\n",
" plt.imshow(img[0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Embeddings"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from tensorflow.keras.layers import Embedding"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model = Sequential()\n",
"model.add(Embedding(input_dim=100, output_dim=2))\n",
"model.compile(loss='binary_crossentropy',\n",
" optimizer='adam',\n",
" metrics=['accuracy'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.summary()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"emb = model.predict(np.array([[81, 1, 96, 79],\n",
" [17, 47, 69, 50],\n",
" [49, 3, 12, 88]]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"emb.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"emb"
]
},
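{
"cell_type": "markdown",
"metadata": {},
"source": [
"Each row of the layer's weight matrix is the 2-dimensional vector for one input index. Reading it back (an added sanity check, not in the original) confirms the `(input_dim, output_dim)` shape:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# the learned embedding matrix: one 2-d vector per input index\n",
"embedding_matrix = model.layers[0].get_weights()[0]\n",
"embedding_matrix.shape"
]
},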
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Sentiment prediction on movie Reviews"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from tensorflow.keras.datasets import imdb"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"(X_train, y_train), (X_test, y_test) = imdb.load_data('/tmp/imdb.npz',\n",
" num_words=None,\n",
" skip_top=0,\n",
" maxlen=None,\n",
" start_char=1,\n",
" oov_char=2,\n",
" index_from=3)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X_train.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X_train[1]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"idx = imdb.get_word_index()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"max(idx.values())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"idx"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"rev_idx = {v+3:k for k,v in idx.items()}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"rev_idx"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"rev_idx[0] = 'padding_char'\n",
"rev_idx[1] = 'start_char'\n",
"rev_idx[2] = 'oov_char'\n",
"rev_idx[3] = 'unk_char'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"rev_idx[3]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"y_train[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"example_review = ' '.join([rev_idx[word] for word in X_train[0]])\n",
"example_review"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"len(X_train[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"len(X_train[1])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"len(X_train[2])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"len(X_train[3])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
"from tensorflow.keras.layers import LSTM"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"maxlen = 100\n",
"\n",
"X_train_pad = pad_sequences(X_train, maxlen=maxlen)\n",
"X_test_pad = pad_sequences(X_test, maxlen=maxlen)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X_train_pad.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X_train_pad[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X_train[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"max_features = max([max(x) for x in X_train_pad] + \n",
" [max(x) for x in X_test_pad]) + 1\n",
"max_features"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"y_train"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model = Sequential()\n",
"model.add(Embedding(max_features, 128))\n",
"model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))\n",
"model.add(Dense(1, activation='sigmoid'))\n",
"\n",
"model.compile(loss='binary_crossentropy',\n",
" optimizer='adam',\n",
" metrics=['accuracy'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.fit(X_train_pad, y_train,\n",
" batch_size=32,\n",
" epochs=2,\n",
" validation_split=0.3)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"score, acc = model.evaluate(X_test_pad, y_test)\n",
"print('Test score:', score)\n",
"print('Test accuracy:', acc)"
]
},
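{
"cell_type": "markdown",
"metadata": {},
"source": [
"To inspect individual predictions (a small addition, not in the original notebook), the sigmoid output can be read as the probability that a review is positive:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# predicted probability of a positive review for the first few test examples\n",
"probs = model.predict(X_test_pad[:5])\n",
"for p, label in zip(probs.ravel(), y_test[:5]):\n",
"    print(f\"predicted: {p:.2f}  actual: {label}\")"
]
},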
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Exercise 1\n",
"\n",
"- Reload the IMDB data keeping only the first 20000 most common words\n",
"- pad the reviews to a shorter length (eg. 70 or 80), this time make sure you keep the first part of the review if it's longer than the maximum length\n",
"- re run the model (remember to set max_features correctly)\n",
"- does it train faster this time?\n",
"- do you get a better performance?"
]
},
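{
"cell_type": "markdown",
"metadata": {},
"source": [
"A hint for the padding step (a sketch, not the full solution): `pad_sequences` truncates from the front by default, so keeping the first part of a long review requires `truncating='post'`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# illustrative only -- maxlen=80 is one of the suggested values;\n",
"# truncating='post' drops the end of reviews longer than maxlen\n",
"X_train_pad_short = pad_sequences(X_train, maxlen=80, truncating='post')"
]
},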
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Exercise 2\n",
"\n",
"- Reload the digits data as above\n",
"- define a function repeated_training_reg_dropout that adds regularization and dropout to a fully connected network\n",
"- compare the performance with/witouth dropout and regularization like we did for batch normalization\n",
"- do you get a better performance?"
]
},
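{
"cell_type": "markdown",
"metadata": {},
"source": [
"A hint (a layer-pattern sketch with placeholder values, not the full solution): inside the model-building loop, each hidden `Dense` layer can combine an L2 weight penalty with a following `Dropout`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# illustrative layer pattern only; the exercise asks for this inside\n",
"# a repeated-training function after reloading the digits data\n",
"reg_model = Sequential()\n",
"reg_model.add(Dense(512,\n",
"                    input_shape=(64,),        # digits images have 64 pixels\n",
"                    kernel_initializer='normal',\n",
"                    kernel_regularizer='l2',  # weight regularization\n",
"                    activation='sigmoid'))\n",
"reg_model.add(Dropout(0.3))                   # dropout after the hidden layer\n",
"reg_model.add(Dense(10, activation='softmax'))\n",
"reg_model.compile('sgd', 'categorical_crossentropy', metrics=['accuracy'])"
]
},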
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Exercise 3\n",
"\n",
"This is a very long and complex exercise, that should give you an idea of a real world scenario. Feel free to look at the solution if you feel lost. Also, feel free to run this on Floyd with a GPU, in which case you don't need to download the data.\n",
"\n",
"If you are running this locally, download and unpack the male/female pictures from [here](https://www.dropbox.com/s/nov493om2jmh2gp/male_female.tgz?dl=0). These images and labels were obtained from [Crowdflower](https://www.crowdflower.com/data-for-everyone/).\n",
"\n",
"Your goal is to build an image classifier that will recognize the gender of a person from pictures.\n",
"\n",
"- Have a look at the directory structure and inspect a couple of pictures\n",
"- Design a model that will take a color image of size 64x64 as input and return a binary output (female=0/male=1)\n",
"- Feel free to introduce any regularization technique in your model (Dropout, Batch Normalization, Weight Regularization)\n",
"- Compile your model with an optimizer of your choice\n",
"- Using `ImageDataGenerator`, define a train generator that will augment your images with some geometric transformations. Feel free to choose the parameters that make sense to you.\n",
"- Define also a test generator, whose only purpose is to rescale the pixels by 1./255\n",
"- use the function `flow_from_directory` to generate batches from the train and test folders. Make sure you set the `target_size` to 64x64.\n",
"- Use the `model.fit_generator` function to fit the model on the batches generated from the ImageDataGenerator. Since you are streaming and augmenting the data in real time you will have to decide how many batches make an epoch and how many epochs you want to run\n",
"- Train your model (you should get to at least 85% accuracy)\n",
"- Once you are satisfied with your training, check a few of the misclassified pictures. Are those sensible errors?"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}