{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Session 1 - Your \"first\" DataScience problem"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Setup: imports, version checks, display/plot options and output folders\n",
"\n",
"# Import from the public IPython.display module (the IPython.core.display path is deprecated)\n",
"from IPython.display import display, HTML\n",
"display(HTML(\"\"))\n",
"\n",
"# Python ≥3.5 is required\n",
"import sys\n",
"assert sys.version_info >= (3, 5)\n",
"\n",
"# Scikit-Learn ≥0.20 is required\n",
"# Compare numeric (major, minor) parts: a plain string comparison is\n",
"# lexicographic and would, for example, rank \"0.9\" above \"0.20\".\n",
"import sklearn\n",
"sklearn_version = tuple(int(part) for part in sklearn.__version__.split(\".\")[:2])\n",
"assert sklearn_version >= (0, 20)\n",
"\n",
"# Common imports\n",
"import numpy as np\n",
"import os\n",
"\n",
"# to make this notebook's output stable across runs\n",
"np.random.seed(42)\n",
"\n",
"# To plot pretty figures\n",
"%matplotlib inline\n",
"import matplotlib as mpl\n",
"import matplotlib.pyplot as plt\n",
"mpl.rc('axes', labelsize=14)\n",
"mpl.rc('xtick', labelsize=12)\n",
"mpl.rc('ytick', labelsize=12)\n",
"plt.rc('font', size=12)\n",
"plt.rc('figure', figsize = (12, 5))\n",
"\n",
"# Settings for the visualizations\n",
"# (the seaborn calls run after the mpl.rc calls above, as they also touch rcParams)\n",
"import seaborn as sns\n",
"sns.set_style(\"whitegrid\")\n",
"sns.set_context(\"notebook\", font_scale=1, rc={\"lines.linewidth\": 2,'font.family': [u'times']})\n",
"\n",
"# Display options for pandas DataFrames\n",
"import pandas as pd\n",
"pd.set_option('display.max_rows', 25)\n",
"pd.set_option('display.max_columns', 500)\n",
"pd.set_option('display.max_colwidth', 50)\n",
"\n",
"# Ignore useless warnings (see SciPy issue #5998)\n",
"import warnings\n",
"warnings.filterwarnings(action=\"ignore\", message=\"^internal gelsd\")\n",
"\n",
"# create the output folders; makedirs also creates the parent 'output' folder,\n",
"# and exist_ok=True makes the cell safe to re-run\n",
"os.makedirs('output/session1', exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"## load data\n",
"# index_col=0: the first CSV column is used as the DataFrame index\n",
"train_set, test_set = (\n",
"    pd.read_csv(csv_path, index_col=0)\n",
"    for csv_path in ('dataset/housing-snapshot/train_set.csv',\n",
"                     'dataset/housing-snapshot/test_set.csv')\n",
")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## The problem\n",
"The machine learning task is to predict the house price, but before that it is important to study the dataset and its features."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"There is 5432 samples\n",
"Each sample has 21 features\n"
]
}
],
"source": [
"# Report the dataset dimensions (rows = samples, columns = features)\n",
"n_samples, n_columns = train_set.shape\n",
"print(\"There is\", n_samples, \"samples\")\n",
"print(\"Each sample has\", n_columns, \"features\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Suburb | \n",
" Address | \n",
" Rooms | \n",
" Type | \n",
" Price | \n",
" Method | \n",
" SellerG | \n",
" Date | \n",
" Distance | \n",
" Postcode | \n",
" Bedroom2 | \n",
" Bathroom | \n",
" Car | \n",
" Landsize | \n",
" BuildingArea | \n",
" YearBuilt | \n",
" CouncilArea | \n",
" Lattitude | \n",
" Longtitude | \n",
" Regionname | \n",
" Propertycount | \n",
"
\n",
" \n",
" index | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Aberfeldie | \n",
" 241 Buckley St | \n",
" 4 | \n",
" h | \n",
" 1380000.0 | \n",
" VB | \n",
" Nelson | \n",
" 12/08/2017 | \n",
" 7.5 | \n",
" 3040.0 | \n",
" 4.0 | \n",
" 2.0 | \n",
" 2.0 | \n",
" 766.0 | \n",
" NaN | \n",
" NaN | \n",
" Moonee Valley | \n",
" -37.75595 | \n",
" 144.90551 | \n",
" Western Metropolitan | \n",
" 1543.0 | \n",
"
\n",
" \n",
" 1 | \n",
" Northcote | \n",
" 67 Charles St | \n",
" 2 | \n",
" h | \n",
" 1100000.0 | \n",
" SP | \n",
" Jellis | \n",
" 20/05/2017 | \n",
" 5.5 | \n",
" 3070.0 | \n",
" 2.0 | \n",
" 1.0 | \n",
" 1.0 | \n",
" 189.0 | \n",
" NaN | \n",
" NaN | \n",
" Darebin | \n",
" -37.77670 | \n",
" 144.99240 | \n",
" Northern Metropolitan | \n",
" 11364.0 | \n",
"
\n",
" \n",
" 2 | \n",
" Balwyn North | \n",
" 42 Maud St | \n",
" 3 | \n",
" h | \n",
" 1480000.0 | \n",
" PI | \n",
" Jellis | \n",
" 15/10/2016 | \n",
" 9.2 | \n",
" 3104.0 | \n",
" 3.0 | \n",
" 1.0 | \n",
" 4.0 | \n",
" 605.0 | \n",
" 116.0 | \n",
" 1950.0 | \n",
" Boroondara | \n",
" -37.79510 | \n",
" 145.06960 | \n",
" Southern Metropolitan | \n",
" 7809.0 | \n",
"
\n",
" \n",
" 3 | \n",
" Brunswick | \n",
" 13 Percy St | \n",
" 3 | \n",
" h | \n",
" 1055000.0 | \n",
" S | \n",
" Nelson | \n",
" 7/05/2016 | \n",
" 5.2 | \n",
" 3056.0 | \n",
" 3.0 | \n",
" 1.0 | \n",
" 1.0 | \n",
" 324.0 | \n",
" NaN | \n",
" 1930.0 | \n",
" Moreland | \n",
" -37.76530 | \n",
" 144.95860 | \n",
" Northern Metropolitan | \n",
" 11918.0 | \n",
"
\n",
" \n",
" 4 | \n",
" Templestowe Lower | \n",
" 253 Thompsons Rd | \n",
" 4 | \n",
" h | \n",
" 1000000.0 | \n",
" VB | \n",
" hockingstuart | \n",
" 13/08/2016 | \n",
" 13.8 | \n",
" 3107.0 | \n",
" 4.0 | \n",
" 3.0 | \n",
" 2.0 | \n",
" 728.0 | \n",
" 164.0 | \n",
" 1970.0 | \n",
" Manningham | \n",
" -37.76800 | \n",
" 145.10270 | \n",
" Eastern Metropolitan | \n",
" 5420.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Suburb Address Rooms Type Price Method \\\n",
"index \n",
"0 Aberfeldie 241 Buckley St 4 h 1380000.0 VB \n",
"1 Northcote 67 Charles St 2 h 1100000.0 SP \n",
"2 Balwyn North 42 Maud St 3 h 1480000.0 PI \n",
"3 Brunswick 13 Percy St 3 h 1055000.0 S \n",
"4 Templestowe Lower 253 Thompsons Rd 4 h 1000000.0 VB \n",
"\n",
" SellerG Date Distance Postcode Bedroom2 Bathroom Car \\\n",
"index \n",
"0 Nelson 12/08/2017 7.5 3040.0 4.0 2.0 2.0 \n",
"1 Jellis 20/05/2017 5.5 3070.0 2.0 1.0 1.0 \n",
"2 Jellis 15/10/2016 9.2 3104.0 3.0 1.0 4.0 \n",
"3 Nelson 7/05/2016 5.2 3056.0 3.0 1.0 1.0 \n",
"4 hockingstuart 13/08/2016 13.8 3107.0 4.0 3.0 2.0 \n",
"\n",
" Landsize BuildingArea YearBuilt CouncilArea Lattitude \\\n",
"index \n",
"0 766.0 NaN NaN Moonee Valley -37.75595 \n",
"1 189.0 NaN NaN Darebin -37.77670 \n",
"2 605.0 116.0 1950.0 Boroondara -37.79510 \n",
"3 324.0 NaN 1930.0 Moreland -37.76530 \n",
"4 728.0 164.0 1970.0 Manningham -37.76800 \n",
"\n",
" Longtitude Regionname Propertycount \n",
"index \n",
"0 144.90551 Western Metropolitan 1543.0 \n",
"1 144.99240 Northern Metropolitan 11364.0 \n",
"2 145.06960 Southern Metropolitan 7809.0 \n",
"3 144.95860 Northern Metropolitan 11918.0 \n",
"4 145.10270 Eastern Metropolitan 5420.0 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first five rows of the training set\n",
"train_set.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# As can be seen, the dataset contains several features, some of them numerical and some categorical.\n",
"# It is important to check each of them to understand it."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Suburb object\n",
"Address object\n",
"Rooms int64\n",
"Type object\n",
"Price float64\n",
"Method object\n",
"SellerG object\n",
"Date object\n",
"Distance float64\n",
"Postcode float64\n",
"Bedroom2 float64\n",
"Bathroom float64\n",
"Car float64\n",
"Landsize float64\n",
"BuildingArea float64\n",
"YearBuilt float64\n",
"CouncilArea object\n",
"Lattitude float64\n",
"Longtitude float64\n",
"Regionname object\n",
"Propertycount float64\n",
"dtype: object"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# we can see the type of each features as follows\n",
"train_set.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Suburb | \n",
" Address | \n",
" Type | \n",
" Method | \n",
" SellerG | \n",
" Date | \n",
" CouncilArea | \n",
" Regionname | \n",
"
\n",
" \n",
" index | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Aberfeldie | \n",
" 241 Buckley St | \n",
" h | \n",
" VB | \n",
" Nelson | \n",
" 12/08/2017 | \n",
" Moonee Valley | \n",
" Western Metropolitan | \n",
"
\n",
" \n",
" 1 | \n",
" Northcote | \n",
" 67 Charles St | \n",
" h | \n",
" SP | \n",
" Jellis | \n",
" 20/05/2017 | \n",
" Darebin | \n",
" Northern Metropolitan | \n",
"
\n",
" \n",
" 2 | \n",
" Balwyn North | \n",
" 42 Maud St | \n",
" h | \n",
" PI | \n",
" Jellis | \n",
" 15/10/2016 | \n",
" Boroondara | \n",
" Southern Metropolitan | \n",
"
\n",
" \n",
" 3 | \n",
" Brunswick | \n",
" 13 Percy St | \n",
" h | \n",
" S | \n",
" Nelson | \n",
" 7/05/2016 | \n",
" Moreland | \n",
" Northern Metropolitan | \n",
"
\n",
" \n",
" 4 | \n",
" Templestowe Lower | \n",
" 253 Thompsons Rd | \n",
" h | \n",
" VB | \n",
" hockingstuart | \n",
" 13/08/2016 | \n",
" Manningham | \n",
" Eastern Metropolitan | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Suburb Address Type Method SellerG \\\n",
"index \n",
"0 Aberfeldie 241 Buckley St h VB Nelson \n",
"1 Northcote 67 Charles St h SP Jellis \n",
"2 Balwyn North 42 Maud St h PI Jellis \n",
"3 Brunswick 13 Percy St h S Nelson \n",
"4 Templestowe Lower 253 Thompsons Rd h VB hockingstuart \n",
"\n",
" Date CouncilArea Regionname \n",
"index \n",
"0 12/08/2017 Moonee Valley Western Metropolitan \n",
"1 20/05/2017 Darebin Northern Metropolitan \n",
"2 15/10/2016 Boroondara Southern Metropolitan \n",
"3 7/05/2016 Moreland Northern Metropolitan \n",
"4 13/08/2016 Manningham Eastern Metropolitan "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show only the categorical (object-dtype) columns\n",
"train_set.select_dtypes(include='object').head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"h 3765\n",
"u 1191\n",
"t 476\n",
"Name: Type, dtype: int64"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Count how many samples fall into each category of the 'Type' feature\n",
"train_set.loc[:, \"Type\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAskAAAE/CAYAAAC0Fl50AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAUSElEQVR4nO3df6zVdf3A8dfh/pKE5MegSNCMBDda5SR+FMmAAY7LlUUwiXaFWLYVM9OscYnV1kgcc2WgNbcMf9GAIUoYoGaLLbjDZCbTMMzklxA/hCEQ3J+f7x/frzd9y/1e9N5zzwEej788x+P9vM5r7+FzH8895rIsywIAAGjRpdADAABAsRHJAACQEMkAAJAQyQAAkBDJAACQKC30AKnm5uY4depUlJWVRS6XK/Q4AABcgLIsi4aGhrj00kujS5f33zcuukg+depU7Ny5s9BjAABwERg0aFB07979fc8XXSSXlZVFxP8OXF5eXuBpLhwvv/xyfOYznyn0GBcM++x4dtrx7LRj2WfHs9OOZ6fnrr6+Pnbu3NnSnqmii+R3PmJRXl4eFRUVBZ7mwmKfHcs+O56ddjw77Vj22fHstOPZ6QfT2sd7/eIeAAAkRDIAACREMgAAJEQyAAAkRDIAACREMgAAJHJZlmWFHuLd6urqWr7jz1eYAABc2M40NcUlJSWdft22mrPovif5HcPWrYtDjY2FHgMAgDw6MGNGoUc4Kx+3AACAhEgGAICESAYAgIRIBgCAhEgGAICESAYAgIRIBgCAhEgGAICESAYAgIRIBgCAhEgGAICESAYAgIRIBgCAhEgGAICESAYAgIRIBgCAhEgGAICESAYAgIRIBgCAhEgGAICESAYAgIRIBgCAhEgGAICESAYAgIRIBgCAhEgGAICESAYAgIRIBgCAhEgGAICESAYAgIRIBgCARN4ieevWrVFdXZ2vHw8AAHnjTjIAACRK8/nDjx49Grfcckvs2bMnrrrqqliyZEmUl5fn85IAANBueb2TvH///vjxj38cGzZsiCNHjsSWLVvyeTkAAOgQeb2TfM0118SAAQMiImLgwIFx7NixfF4OAAA6RF7vJJeW/rfBc7lcZFmWz8sBAECH8It7AACQEMkAAJDI22eShw8fHsOHD295fPfdd+frUgAA0KHcSQYAgIRIBgCAhEgGAICESAYAgIRIBgCAhEgGAICESAYAgIRIBgCAhEgGAICESAYAgIRIBgCAhEgGAICESAYAgIRIBgCAhEgGAICESAYAgIRIBgCAhEgGAICESAYAgIRIBgCAhEgGAICESAYAgIRIBgCAhEgGAICESAYAgIRIBgCAhEgGAICESAYAgIRIBgCAhEgGAIBEaaEHaM3zVVVRUVFR6DEAAMijM01NcUlJSaHHeB93ki8S27ZtK/QIFxT77Hh22vHstGPZZ8ez0453Pu60GAM5QiQDAMD7iGQAAEiIZAAASIhkAABIiGQAAEiIZAAASIhkAABIiGQAAEiIZAAASIhkAABIiGQAAEiIZAAASIhkAABIiGQAAEiIZAAASIhkAABIiOSLxHXXXVfoES4oxbDPM01NhR4BAC5YpYUeoDXD1q2LQ42NhR4DitaBGTMKPQIAXLDcSQYAgIRIBgCAhEgGAICESAYAgIRIBgCAhEgGAICESAYAgIRIBgCAhEgGAICESAYAgIRIBgCAhEgGAICESAYAgIRIBgCAhEgGAICESAYAgIRIBgCAhEgGAICESAYAgIRIBgCAhEgGAICESAYAgIRIBgCAhEgGAIDEOUXy9u3bY8WKFVFfXx8vvvhivmcCAICCajOS16xZEzU1NfGb3/wmTpw4Ed/5zndi1apVnTEbAAAURJuR/Oijj8bKlSujW7du0bt371izZk08/PDDnTEbAAAURJuR3KVLl+jWrVvL4379+kVJSUlehwIAgEJqM5J79OgRO3bsiFwuFxERv//97+Oyyy7L+2AAAFAopW29YP78+X
HbbbfFnj17YtSoUVFRURG/+tWvOmM2AAAoiDYjeeDAgbF27drYtWtXNDU1xVVXXRVlZWWdMRsAABREm5Hc1NQUK1asiL/85S9RUlISY8eOjalTp3bGbAAAUBBtRvLChQvjn//8Z0yZMiWyLIvVq1fH7t274/bbb++M+QAAoNO1+Yt7mzdvjt/+9rcxbdq0mD59eixbtiw2btzY5g/eunVrVFdXtzyeN29erFmzpn3TAgBAJ2gzknv16hVNTU0tj3O5XHz0ox/N61AAAFBIbX7c4pprromZM2fG1KlTo6SkJNavXx89e/aMZcuWRUTEN77xjbwPCQAAnanNSK6rq4vBgwfHK6+8EhER/fv3j4iInTt3/r//XC6XiyzLWh43NDS0Z04AAOg0bUby+PHjY8yYMS3/M5Fz1bNnz9i7d2/U1dXF6dOnY9u2bfGlL33pQw8KAACdpc3PJD/66KMxbty4+PWvfx2HDx8+5x989dVXx+jRo6OysjJuu+22uO6669o1KAAAdJY27yQvW7Ys9uzZE6tWrYrp06fHZz/72fja174WI0eObPOH//SnP+2QIQEAoDO1eSc5IuKKK66I22+/PWpqauKVV16JO+64I6qqqmL79u35ng8AADpdq3eS6+vro7y8PHbv3h2rVq2KtWvXxuDBg2P+/PkxZsyYeOmll+J73/te/OlPf+rMeQEAIO9avZN80003RUTE9OnTo7GxMR577LF48MEHY9y4cdGlS5e49tprY9iwYZ02KAAAdJZW7yS/8/VtmzZtiq5du571NXfffXd+pgIAgAJqNZLr6uri73//+3u+6/jdhgwZkrehAACgkFqN5L1798att9561kjO5XLx3HPP5XUwAAAolFYj+dOf/nQ8+eSTnTgKAAAUh3P6CjgAALiYtBrJQ4cO7cw5AACgaLQayQsWLOjMOQAAoGj4uAUAACREMgAAJEQyAAAkRDIAACREMgAAJEQyAAAkRDIAACREMgAAJEQyAAAkRDIAACREMgAAJEQyAAAkRDIAACREMgAAJEQyAAAkRDIAACREMgAAJEQyAAAkRDIAACREMgAAJEQyAAAkSgs9QGuer6qKioqKQo8BRetMU1NcUlJS6DEA4ILkTvJFYtu2bYUe4YJSDPsUyACQPyIZAAASIhkAABIiGQAAEiIZAAASIhkAABIiGQAAEiIZAAASIhkAABIiGQAAEiIZAAASIhkAABIiGQAAEiIZAAASIhkAABIiGQAAEiIZAAASIvkCdKapqdAjAACc10oLPUBrhq1bF4caGws9xnnpwIwZhR4BAOC85k4yAAAkRDIAACREMgAAJEQyAAAkRDIAACREMgAAJEQyAAAkRDIAACREMgAAJEQyAAAkRDIAACREMgAAJEQyAAAkRDIAACREMgAAJEQyAAAkRDIAACREMgAAJEQyAAAkRDIAACREMgAAJEQyAAAkRDIAACREMgAAJEQyAAAkRDIAACREMgAAJEQyAAAkRDIAACREMgAAJEQyAAAk8h7JJ06ciLlz5+b7MgAA0GHyHsnHjx+PHTt25PsyAADQYfIeyQsXLoxDhw65mwwAwHkj75G8YMGC6Nu3b9x///35vhQAAHQIv7gHAAAJkQwAAIm8R3JpaWk0Njbm+zIAANBh8h7JvXv3jk984hNRXV2d70sBAECHKM33BcrKymLFihX5vgwAAHQYn0kGAICESAYAgIRIBgCAhEgGAICESAYAgIRIBgCAhEgGAICESAYAgIRIBgCAhEgGAICESAYAgIRIBgCAhEgGAICESAYAgIRIBgCAhEgGAICESAYAgIRIBgCAhEgGAICESAYAgIRIBgCAhEgGAICESAYAgIRIBgCAhEgGAICESAYAgIRIBgCAhEgGAICESAYAgIRIBgCARGmhB2jN81VVUVFRUegxzktnmprikpKSQo8BAHDecif5AiSQAQDaRyQDAEBCJAMAQEIkAwBAQiQDAEBCJAMAQEIkAwBAoui+Jz
nLsoiIqK+vL/AkF566urpCj3BBsc+OZ6cdz047ln12PDvteHZ6bt5pzXfaM5XLWvs7BXLixInYuXNnoccAAOAiMGjQoOjevfv7ni+6SG5ubo5Tp05FWVlZ5HK5Qo8DAMAFKMuyaGhoiEsvvTS6dHn/J5CLLpIBAKDQ/OIeAAAkRDIAACREMgAAJEQyAAAkRDIAACREMgAAJEQyAAAkRDIAACSKKpLXrVsXkyZNigkTJsTy5csLPc55pbq6OiorK2PKlCkxZcqUeOmll2LLli1RVVUVEyZMiF/84hctr92xY0dMnTo1Jk6cGD/60Y+isbGxgJMXl5MnT8bkyZNj3759EREfeIf79++Pr3/963HDDTfEt7/97Th16lRB3kcxSXdaU1MTEyZMaDmrzz77bETY6bm67777orKyMiorK2Px4sUR4Zy2x9n26Yy2zy9/+cuYNGlSVFZWxrJlyyLCGW2vs+3UOe0EWZH497//nY0ZMyY7duxYdurUqayqqip77bXXCj3WeaG5uTkbNWpU1tDQ0PLc6dOns9GjR2d79uzJGhoasjlz5mR//vOfsyzLssrKyuzFF1/MsizLampqsuXLlxdi7KLzt7/9LZs8eXI2ZMiQbO/evR9qh9/61reyp556KsuyLLvvvvuyxYsXF+S9FIt0p1mWZZMnT84OHjz4vtfaads2b96c3XTTTVldXV1WX1+f3Xzzzdm6deuc0w/pbPt85plnnNF22Lp1azZjxoysoaEhO336dDZmzJhsx44dzmg7nG2nr7/+unPaCYrmTvKWLVtixIgR0aNHj/jIRz4SEydOjI0bNxZ6rPPCv/71r4iImDNnTtx4443x2GOPxfbt2+PKK6+MAQMGRGlpaVRVVcXGjRvjzTffjDNnzsTnP//5iIiYOnWqPf+fVatWxU9+8pPo27dvRMQH3mFDQ0P89a9/jYkTJ77n+YtZutPTp0/H/v37Y/78+VFVVRVLliyJ5uZmOz1Hffr0iXnz5kV5eXmUlZXFwIEDY9euXc7ph3S2fe7fv98ZbYdhw4bFI488EqWlpfHWW29FU1NTvP32285oO5xtp5dccolz2gmKJpIPHToUffr0aXnct2/fOHjwYAEnOn+8/fbbMXLkyLj//vvjoYceihUrVsT+/fvPus90z3369LHn//Ozn/0shg4d2vK4tTPZ2g6PHTsW3bp1i9LS0vc8fzFLd3rkyJEYMWJE3HXXXbFq1ap44YUXYvXq1XZ6jq6++uqWf/nt2rUrNmzYELlczjn9kM62zy9/+cvOaDuVlZXFkiVLorKyMkaOHOnP0g6Q7rSxsdE57QRFE8nNzc2Ry+VaHmdZ9p7HtO7aa6+NxYsXR/fu3aNXr14xbdq0WLJkyVn3ac/nrrVdtfb82XZpt+81YMCAuP/++6Nv377RtWvXqK6ujk2bNtnpB/Taa6/FnDlz4oc//GEMGDDAOW2nd+/zU5/6lDPaAb773e9GbW1tHDhwIHbt2uWMdoB377S2ttY57QRFE8kf//jH4/Dhwy2PDx8+3PKfaPn/vfDCC1FbW9vyOMuyuPzyy8+6z3TPR44csedWtHYmW9thr1694sSJE9HU1PSe1/Nf//jHP+Lpp59ueZxlWZSWltrpB7Bt27aYPXt2fP/734+vfOUrzmk7pft0Rtvn9ddfjx07dkRERNeuXWPChAmxdetWZ7QdzrbT9evXO6edoGgi+Ytf/GLU1tbG0aNH4/Tp0/HMM8/E9ddfX+ixzgsnTpyIxYsXR11dXZw8eTKeeOKJuOOOO+KNN96I3bt3R1NTUzz11FNx/fXXx+WXXx4VFRWxbdu2iIhYu3atPbfic5/73AfaYVlZWQwdOjTWr18fERFPPvmk3SayLIu77rorjh8/Hg0NDbFy5coYP368nZ6jAwcOxNy5c+Oee+6JysrKiHBO2+Ns+3RG22ffvn2xYMGCqK+vj/r6+njuuedixowZzmg7nG2nX/jCF5zTTpDLsiwr9BDvWL
duXTzwwAPR0NAQ06ZNi1tuuaXQI5037r333nj66aejubk5Zs6cGbNmzYra2tpYtGhR1NXVxejRo6OmpiZyuVy8+uqrsWDBgjh58mQMGTIkFi1aFOXl5YV+C0Vj7Nix8cgjj0T//v0/8A7ffPPNmDdvXrz11lvRr1+/+PnPfx6XXXZZod9Swb17p8uXL4/ly5dHY2NjTJgwIe68886ICDs9BwsXLozHH388rrjiipbnZsyYEZ/85Ced0w+htX02Nzc7o+2wdOnS2LBhQ5SUlMSECRPi1ltv9WdpO51tp/4szb+iimQAACgGRfNxCwAAKBYiGQAAEiIZAAASIhkAABIiGQAAEiIZ4CI0Z86cOHr0aKHHAChaIhngIrR58+ZCjwBQ1EQyQJFZvXp1VFZWRlVVVdx8881x4MCBWLlyZUyePDluvPHGmDNnTrzxxhsRETFv3rx48MEHW/7Zdz8eO3ZsLF26NGbOnBljxoyJe++9NyIiampqIiJi1qxZceDAgc59cwDnidJCDwDAf7366qtxzz33xBNPPBH9+vWLhx56KGbPnh3Nzc2xcuXK6NWrV6xZsybmzp0bf/jDH9r8ef/5z3/id7/7XRw8eDDGjx8fX/3qV2PRokWxZs2aePjhh6NXr16d8K4Azj/uJAMUkdra2hg1alT069cvIiJmz54d48aNi0mTJrUE7dSpU+PgwYOxb9++Nn/euHHjIiLiYx/7WPTu3TuOHz+ev+EBLiAiGaCIlJSURC6Xa3l85syZ2Lt37/tel2VZNDY2Ri6XiyzLWp5vaGh4z+sqKipa/jp9LQCtE8kARWT48OFRW1sbhw4dioiIFStWxKZNm2L9+vUt30bx+OOPR48ePeLKK6+Mnj17xssvvxwREQcPHoznn3/+nK5TUlISjY2N+XkTABcAn0kGKCKDBw+OH/zgB/HNb34zIiL69OkTzz77bPzxj3+MWbNmRXNzc/Tq1SseeOCB6NKlS1RXV8edd94ZEydOjP79+8eIESPO6To33HBDVFdXx9KlS2PQoEH5fEsA56Vc5r+9AQDAe/i4BQAAJEQyAAAkRDIAACREMgAAJEQyAAAkRDIAACREMgAAJP4HEnpfPrWD1WIAAAAASUVORK5CYII=\n",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Bar chart with the number of samples per property 'Type'\n",
"sns.countplot(y=\"Type\", data=train_set, color=\"c\")\n",
"plt.show()  # render the figure without echoing the Axes repr, as the other plotting cells do"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"It would be interesting to visualize all features (numerical and categorical) in order to understand them.\n",
"\n",
"Check out this blog for plotting distribution: https://seaborn.pydata.org/tutorial/distributions.html\n",
"+ The Seaborn version used in this tutorial may differ from the one installed on your machine (version 0.11 has just been released)\n",
"\n",
"Check out this blog for plotting categorical data: https://seaborn.pydata.org/tutorial/categorical.html"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Histogram of the target variable (house price)\n",
"sns.histplot(data=train_set[\"Price\"])\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## BASELINE MODEL\n",
"### https://www.kaggle.com/t/b02bcc4763e14d0486715e7c8bc946fe\n",
"\n",
"This is a simple model that uses the K-nearest Neighbors Regressor\n",
"\n",
"This model only uses 4 features: 'Rooms','Landsize', 'BuildingArea', 'YearBuilt'"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"----- EVALUATION ON TRAIN SET ------\n",
"RMSE 397813.8038017269\n",
"R^2: 0.6219552116625728\n"
]
},
{
"data": {
"image/png": "\n",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"## Baseline: K-nearest-neighbours regression on four numerical features\n",
"from sklearn import neighbors\n",
"from sklearn.metrics import mean_squared_error, r2_score\n",
"\n",
"## the features\n",
"features = ['Rooms','Landsize', 'BuildingArea', 'YearBuilt']\n",
"## DEFINE YOUR FEATURES\n",
"# Missing values are replaced by 0 before fitting\n",
"X = train_set[features].fillna(0)\n",
"y = train_set[['Price']]\n",
"\n",
"## the model\n",
"# KNeighborsRegressor\n",
"n_neighbors = 3 # you can modify this parameter (ONLY THIS ONE!!!)\n",
"model = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors)\n",
"\n",
"## fit the model\n",
"model.fit(X, y)\n",
"\n",
"## predict training set\n",
"# Kept under its own name so it is not overwritten by the test-set predictions below\n",
"y_train_pred = model.predict(X)\n",
"\n",
"## Evaluate the model and plot it\n",
"# NOTE: these scores are computed on the same data the model was fitted on\n",
"print(\"----- EVALUATION ON TRAIN SET ------\")\n",
"print(\"RMSE\",np.sqrt(mean_squared_error(y, y_train_pred)))\n",
"print(\"R^2: \",r2_score(y, y_train_pred))\n",
"\n",
"\n",
"plt.scatter(y, y_train_pred)\n",
"plt.xlabel('Price')\n",
"plt.ylabel('Predicted price')\n",
"plt.show()\n",
"\n",
"## predict the test set and generate the submission file\n",
"X_test = test_set[features].fillna(0)\n",
"y_pred = model.predict(X_test)\n",
"\n",
"# Use the test set's own index as the submission id instead of a fresh 0..n-1\n",
"# counter, so that every prediction stays attached to the row it was made for.\n",
"# np.ravel flattens the (n, 1) prediction array produced by the 2-D target.\n",
"df_output = pd.DataFrame({'index': test_set.index, 'Price': np.ravel(y_pred)})\n",
"\n",
"df_output.to_csv('output/session1/baseline.csv',index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## EXERCISE - TRAIN A MODEL and upload your best solution to the Kaggle Challenge\n",
"\n",
"Tasks: \n",
"+ Choose the best features for the problem. Some features are numerical but others are categorical; think about how to encode all of them.\n",
"+ Choose the model. You have two choices: LinearRegression and KNeighborsRegressor\n",
" + You can change the parameter n_neighbors of the KNeighborsRegressor\n",
" \n",
"LINEAR REGRESSOR:\n",
">from sklearn.linear_model import LinearRegression\n",
"\n",
">model = LinearRegression()\n",
"\n",
"KNeighborsRegressor\n",
"> from sklearn import neighbors\n",
"\n",
"> n_neighbors = 3 # you can modify this parameter (ONLY THIS ONE!!!)\n",
"\n",
"> model = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors)\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"## the features\n",
"\n",
"## the model\n",
"\n",
"## fit the model\n",
"\n",
"## predict training set\n",
"\n",
"## Evaluate the model and plot it\n",
"\n",
"## predict the test set and generate the submission file"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Explain the chosen model and why you think it is the best."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
}
},
"nbformat": 4,
"nbformat_minor": 4
}