model-registry/notebooks/wip/Linear_Probability_and_logistic_Regression_holdout.ipynb

3290 lines
180 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 56,
"metadata": {
"id": "xwFyEsosINqT"
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {
"id": "pKewSQysItJ-"
},
"outputs": [],
"source": [
"# https://www.statsmodels.org/stable/index.html\n",
"import statsmodels.api as sm"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {
"id": "Lz-DyAtNWsJR"
},
"outputs": [],
"source": [
"# Download Dataset from https://www.dropbox.com/scl/fi/32vgpt3jvtztu86avdnwg/Mortgage.xlsx?rlkey=qx1d46hzgn4h67zrcyajdyl3e&dl=1\n",
"# and add it to colab"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {
"id": "0zM8FGMJXJ70"
},
"outputs": [],
"source": [
"# mortgageDf = pd.read_excel(\"./Mortgage.xlsx\")\n",
"mortgageDf = pd.read_excel(\"https://www.dropbox.com/scl/fi/32vgpt3jvtztu86avdnwg/Mortgage.xlsx?rlkey=qx1d46hzgn4h67zrcyajdyl3e&dl=1\")"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 990
},
"id": "t0LUca0Myqw5",
"outputId": "527eb991-fb2c-420a-e8fe-9b983e793560"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>y</th>\n",
" <th>x1</th>\n",
" <th>x2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>16.35</td>\n",
" <td>49.94</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>34.43</td>\n",
" <td>56.16</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>39.19</td>\n",
" <td>36.89</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>23.58</td>\n",
" <td>56.88</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>29.92</td>\n",
" <td>27.05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1</td>\n",
" <td>25.26</td>\n",
" <td>44.38</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>1</td>\n",
" <td>36.51</td>\n",
" <td>48.98</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>1</td>\n",
" <td>11.70</td>\n",
" <td>55.55</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>0</td>\n",
" <td>32.21</td>\n",
" <td>31.28</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>1</td>\n",
" <td>28.74</td>\n",
" <td>35.63</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>1</td>\n",
" <td>18.28</td>\n",
" <td>39.50</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>0</td>\n",
" <td>10.12</td>\n",
" <td>31.39</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>0</td>\n",
" <td>10.39</td>\n",
" <td>29.47</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>0</td>\n",
" <td>21.46</td>\n",
" <td>29.34</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>1</td>\n",
" <td>33.56</td>\n",
" <td>40.37</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>1</td>\n",
" <td>37.91</td>\n",
" <td>22.92</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>1</td>\n",
" <td>31.81</td>\n",
" <td>47.56</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>0</td>\n",
" <td>25.88</td>\n",
" <td>44.58</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>1</td>\n",
" <td>38.40</td>\n",
" <td>47.85</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>0</td>\n",
" <td>26.62</td>\n",
" <td>25.50</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>0</td>\n",
" <td>14.36</td>\n",
" <td>21.87</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>1</td>\n",
" <td>22.22</td>\n",
" <td>20.79</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>1</td>\n",
" <td>32.10</td>\n",
" <td>51.56</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>0</td>\n",
" <td>11.75</td>\n",
" <td>32.96</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>1</td>\n",
" <td>10.32</td>\n",
" <td>48.59</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>0</td>\n",
" <td>11.43</td>\n",
" <td>34.78</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>0</td>\n",
" <td>12.58</td>\n",
" <td>33.27</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>0</td>\n",
" <td>27.53</td>\n",
" <td>25.63</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>1</td>\n",
" <td>36.71</td>\n",
" <td>37.05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>0</td>\n",
" <td>17.85</td>\n",
" <td>26.86</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" y x1 x2\n",
"0 1 16.35 49.94\n",
"1 1 34.43 56.16\n",
"2 1 39.19 36.89\n",
"3 1 23.58 56.88\n",
"4 0 29.92 27.05\n",
"5 1 25.26 44.38\n",
"6 1 36.51 48.98\n",
"7 1 11.70 55.55\n",
"8 0 32.21 31.28\n",
"9 1 28.74 35.63\n",
"10 1 18.28 39.50\n",
"11 0 10.12 31.39\n",
"12 0 10.39 29.47\n",
"13 0 21.46 29.34\n",
"14 1 33.56 40.37\n",
"15 1 37.91 22.92\n",
"16 1 31.81 47.56\n",
"17 0 25.88 44.58\n",
"18 1 38.40 47.85\n",
"19 0 26.62 25.50\n",
"20 0 14.36 21.87\n",
"21 1 22.22 20.79\n",
"22 1 32.10 51.56\n",
"23 0 11.75 32.96\n",
"24 1 10.32 48.59\n",
"25 0 11.43 34.78\n",
"26 0 12.58 33.27\n",
"27 0 27.53 25.63\n",
"28 1 36.71 37.05\n",
"29 0 17.85 26.86"
]
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mortgageDf"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "GQRNPIeyy6ub",
"outputId": "af3a1828-5bfb-4458-ee99-ecebf88ab76e"
},
"outputs": [
{
"data": {
"text/plain": [
"90"
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mortgageDf.size"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 300
},
"id": "yumMybniy85d",
"outputId": "e85c111d-108b-4a30-e3f1-cbcb8b515223"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>y</th>\n",
" <th>x1</th>\n",
" <th>x2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>30.000000</td>\n",
" <td>30.000000</td>\n",
" <td>30.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>0.566667</td>\n",
" <td>24.305667</td>\n",
" <td>37.819333</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>0.504007</td>\n",
" <td>9.842847</td>\n",
" <td>10.942216</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>0.000000</td>\n",
" <td>10.120000</td>\n",
" <td>20.790000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>0.000000</td>\n",
" <td>14.857500</td>\n",
" <td>29.372500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>1.000000</td>\n",
" <td>25.570000</td>\n",
" <td>36.260000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>1.000000</td>\n",
" <td>32.182500</td>\n",
" <td>47.777500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>1.000000</td>\n",
" <td>39.190000</td>\n",
" <td>56.880000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" y x1 x2\n",
"count 30.000000 30.000000 30.000000\n",
"mean 0.566667 24.305667 37.819333\n",
"std 0.504007 9.842847 10.942216\n",
"min 0.000000 10.120000 20.790000\n",
"25% 0.000000 14.857500 29.372500\n",
"50% 1.000000 25.570000 36.260000\n",
"75% 1.000000 32.182500 47.777500\n",
"max 1.000000 39.190000 56.880000"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mortgageDf.describe()"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "aspq6hoPy_xZ",
"outputId": "6fa553af-188e-40f6-bf37-3a61224c5b0c"
},
"outputs": [
{
"data": {
"text/plain": [
"(30, 3)"
]
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mortgageDf.shape"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {
"id": "z_hVTvPrzYJr"
},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"id": "pIniVuaIzaaZ",
"outputId": "34e78f06-e2c7-4701-c78f-5aae99a9deb0"
},
"outputs": [
{
"data": {
"text/plain": [
"<Figure size 800x800 with 0 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Plotting\n",
"fig1 = plt.figure(\n",
" figsize=(8, 8)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 449
},
"id": "VHdpDE7o42Pf",
"outputId": "e2532b62-f91f-4497-e2da-540c54f34f2f"
},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.scatter(\n",
" mortgageDf[\"x1\"],\n",
" mortgageDf[\"y\"],\n",
" color='blue',\n",
" alpha=0.9,\n",
" label='Data Points - scatter',\n",
")\n",
"\n",
"plt.xlabel('x1')\n",
"plt.ylabel('y')\n",
"plt.legend()\n",
"plt.grid(True)\n",
"\n",
"\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "ean6vMxkWfHF"
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 449
},
"id": "knAa4W9R47rZ",
"outputId": "cb8121da-a185-417f-fa26-a0e9ad2b8faa"
},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.scatter(\n",
" mortgageDf[\"x2\"],\n",
" mortgageDf[\"y\"],\n",
" color='blue',\n",
" alpha=0.9,\n",
" label='Data Points - scatter',\n",
")\n",
"\n",
"plt.xlabel('x2')\n",
"plt.ylabel('y')\n",
"plt.legend()\n",
"plt.grid(True)\n",
"\n",
"\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "alIhUPPUzvli",
"outputId": "8f9061b4-09dd-4525-f39e-797b603cfd53"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" OLS Regression Results \n",
"==============================================================================\n",
"Dep. Variable: y R-squared: 0.506\n",
"Model: OLS Adj. R-squared: 0.469\n",
"Method: Least Squares F-statistic: 13.82\n",
"Date: Sun, 09 Jun 2024 Prob (F-statistic): 7.37e-05\n",
"Time: 15:09:54 Log-Likelihood: -10.931\n",
"No. Observations: 30 AIC: 27.86\n",
"Df Residuals: 27 BIC: 32.07\n",
"Df Model: 2 \n",
"Covariance Type: nonrobust \n",
"==============================================================================\n",
" coef std err t P>|t| [0.025 0.975]\n",
"------------------------------------------------------------------------------\n",
"const -0.8682 0.281 -3.089 0.005 -1.445 -0.291\n",
"x1 0.0188 0.007 2.694 0.012 0.004 0.033\n",
"x2 0.0258 0.006 4.107 0.000 0.013 0.039\n",
"==============================================================================\n",
"Omnibus: 1.526 Durbin-Watson: 2.217\n",
"Prob(Omnibus): 0.466 Jarque-Bera (JB): 0.712\n",
"Skew: 0.357 Prob(JB): 0.700\n",
"Kurtosis: 3.247 Cond. No. 194.\n",
"==============================================================================\n",
"\n",
"Notes:\n",
"[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n"
]
}
],
"source": [
"model1 = sm.OLS(\n",
" mortgageDf[\"y\"],\n",
" sm.add_constant(mortgageDf[[\"x1\", \"x2\"]])\n",
")\n",
"model1Fit = model1.fit()\n",
"print(model1Fit.summary())"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 990
},
"id": "S-AyfiLN0Due",
"outputId": "827d6090-8431-46a4-fb36-c6e884539662"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>y</th>\n",
" <th>x1</th>\n",
" <th>x2</th>\n",
" <th>predict1</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>16.35</td>\n",
" <td>49.94</td>\n",
" <td>0.729871</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>34.43</td>\n",
" <td>56.16</td>\n",
" <td>1.231162</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>39.19</td>\n",
" <td>36.89</td>\n",
" <td>0.823078</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>23.58</td>\n",
" <td>56.88</td>\n",
" <td>1.045349</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>29.92</td>\n",
" <td>27.05</td>\n",
" <td>0.394258</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1</td>\n",
" <td>25.26</td>\n",
" <td>44.38</td>\n",
" <td>0.754114</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>1</td>\n",
" <td>36.51</td>\n",
" <td>48.98</td>\n",
" <td>1.084883</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>1</td>\n",
" <td>11.70</td>\n",
" <td>55.55</td>\n",
" <td>0.787177</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>0</td>\n",
" <td>32.21</td>\n",
" <td>31.28</td>\n",
" <td>0.546666</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>1</td>\n",
" <td>28.74</td>\n",
" <td>35.63</td>\n",
" <td>0.593656</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>1</td>\n",
" <td>18.28</td>\n",
" <td>39.50</td>\n",
" <td>0.496558</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>0</td>\n",
" <td>10.12</td>\n",
" <td>31.39</td>\n",
" <td>0.133337</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>0</td>\n",
" <td>10.39</td>\n",
" <td>29.47</td>\n",
" <td>0.088829</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>0</td>\n",
" <td>21.46</td>\n",
" <td>29.34</td>\n",
" <td>0.294027</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>1</td>\n",
" <td>33.56</td>\n",
" <td>40.37</td>\n",
" <td>0.806902</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>1</td>\n",
" <td>37.91</td>\n",
" <td>22.92</td>\n",
" <td>0.438106</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>1</td>\n",
" <td>31.81</td>\n",
" <td>47.56</td>\n",
" <td>0.959656</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>0</td>\n",
" <td>25.88</td>\n",
" <td>44.58</td>\n",
" <td>0.770960</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>1</td>\n",
" <td>38.40</td>\n",
" <td>47.85</td>\n",
" <td>1.091301</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>0</td>\n",
" <td>26.62</td>\n",
" <td>25.50</td>\n",
" <td>0.292049</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>0</td>\n",
" <td>14.36</td>\n",
" <td>21.87</td>\n",
" <td>-0.032692</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>1</td>\n",
" <td>22.22</td>\n",
" <td>20.79</td>\n",
" <td>0.087491</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>1</td>\n",
" <td>32.10</td>\n",
" <td>51.56</td>\n",
" <td>1.068443</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>0</td>\n",
" <td>11.75</td>\n",
" <td>32.96</td>\n",
" <td>0.204600</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>1</td>\n",
" <td>10.32</td>\n",
" <td>48.59</td>\n",
" <td>0.581396</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>0</td>\n",
" <td>11.43</td>\n",
" <td>34.78</td>\n",
" <td>0.245584</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>0</td>\n",
" <td>12.58</td>\n",
" <td>33.27</td>\n",
" <td>0.228245</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>0</td>\n",
" <td>27.53</td>\n",
" <td>25.63</td>\n",
" <td>0.312551</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>1</td>\n",
" <td>36.71</td>\n",
" <td>37.05</td>\n",
" <td>0.780489</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>0</td>\n",
" <td>17.85</td>\n",
" <td>26.86</td>\n",
" <td>0.161955</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" y x1 x2 predict1\n",
"0 1 16.35 49.94 0.729871\n",
"1 1 34.43 56.16 1.231162\n",
"2 1 39.19 36.89 0.823078\n",
"3 1 23.58 56.88 1.045349\n",
"4 0 29.92 27.05 0.394258\n",
"5 1 25.26 44.38 0.754114\n",
"6 1 36.51 48.98 1.084883\n",
"7 1 11.70 55.55 0.787177\n",
"8 0 32.21 31.28 0.546666\n",
"9 1 28.74 35.63 0.593656\n",
"10 1 18.28 39.50 0.496558\n",
"11 0 10.12 31.39 0.133337\n",
"12 0 10.39 29.47 0.088829\n",
"13 0 21.46 29.34 0.294027\n",
"14 1 33.56 40.37 0.806902\n",
"15 1 37.91 22.92 0.438106\n",
"16 1 31.81 47.56 0.959656\n",
"17 0 25.88 44.58 0.770960\n",
"18 1 38.40 47.85 1.091301\n",
"19 0 26.62 25.50 0.292049\n",
"20 0 14.36 21.87 -0.032692\n",
"21 1 22.22 20.79 0.087491\n",
"22 1 32.10 51.56 1.068443\n",
"23 0 11.75 32.96 0.204600\n",
"24 1 10.32 48.59 0.581396\n",
"25 0 11.43 34.78 0.245584\n",
"26 0 12.58 33.27 0.228245\n",
"27 0 27.53 25.63 0.312551\n",
"28 1 36.71 37.05 0.780489\n",
"29 0 17.85 26.86 0.161955"
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predict1 = model1Fit.predict(sm.add_constant(mortgageDf[[\"x1\", \"x2\"]]))\n",
"mortgageDf['predict1'] = predict1\n",
"mortgageDf"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "9ouX-mzz4sl-",
"outputId": "6f95fccb-ab1c-4fef-a53f-d744ad00a45b"
},
"outputs": [
{
"data": {
"text/plain": [
"array([0.28356899])"
]
},
"execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model1Fit.predict([[1, 20, 30]])"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ujSQIAwa8DRG",
"outputId": "ff3d1a58-32f5-4bef-cb57-97ab79bbdd53"
},
"outputs": [
{
"data": {
"text/plain": [
"array([-0.10389379])"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model1Fit.predict([[1, 20, 15]])"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "yQ8XuYfr8Fs6",
"outputId": "06169e06-16e7-44be-c599-ba17fceb48ca"
},
"outputs": [
{
"data": {
"text/plain": [
"array([1.17698081])"
]
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model1Fit.predict([[1, 40, 50]])"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "l_wGiUmL9Xta",
"outputId": "9beb1054-bd82-4438-b3f7-d115d51a8b88"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Optimization terminated successfully.\n",
" Current function value: 0.363910\n",
" Iterations 7\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: y No. Observations: 30\n",
"Model: Logit Df Residuals: 27\n",
"Method: MLE Df Model: 2\n",
"Date: Sun, 09 Jun 2024 Pseudo R-squ.: 0.4681\n",
"Time: 15:09:54 Log-Likelihood: -10.917\n",
"converged: True LL-Null: -20.527\n",
"Covariance Type: nonrobust LLR p-value: 6.708e-05\n",
"==============================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"------------------------------------------------------------------------------\n",
"const -9.3671 3.196 -2.931 0.003 -15.631 -3.103\n",
"x1 0.1349 0.064 2.107 0.035 0.009 0.260\n",
"x2 0.1782 0.065 2.758 0.006 0.052 0.305\n",
"==============================================================================\n"
]
}
],
"source": [
"model2 = sm.Logit(\n",
" mortgageDf[\"y\"],\n",
" sm.add_constant(mortgageDf[[\"x1\", \"x2\"]])\n",
")\n",
"model2Fit = model2.fit()\n",
"print(model2Fit.summary())"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 990
},
"id": "hICJCcTx9gKy",
"outputId": "6d072132-6408-4df8-ac73-75bb7a7bd6b2"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>y</th>\n",
" <th>x1</th>\n",
" <th>x2</th>\n",
" <th>predict1</th>\n",
" <th>predict2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>16.35</td>\n",
" <td>49.94</td>\n",
" <td>0.729871</td>\n",
" <td>0.850564</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>34.43</td>\n",
" <td>56.16</td>\n",
" <td>1.231162</td>\n",
" <td>0.994966</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>39.19</td>\n",
" <td>36.89</td>\n",
" <td>0.823078</td>\n",
" <td>0.923739</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>23.58</td>\n",
" <td>56.88</td>\n",
" <td>1.045349</td>\n",
" <td>0.981132</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>29.92</td>\n",
" <td>27.05</td>\n",
" <td>0.394258</td>\n",
" <td>0.375201</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1</td>\n",
" <td>25.26</td>\n",
" <td>44.38</td>\n",
" <td>0.754114</td>\n",
" <td>0.875451</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>1</td>\n",
" <td>36.51</td>\n",
" <td>48.98</td>\n",
" <td>1.084883</td>\n",
" <td>0.986447</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>1</td>\n",
" <td>11.70</td>\n",
" <td>55.55</td>\n",
" <td>0.787177</td>\n",
" <td>0.892025</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>0</td>\n",
" <td>32.21</td>\n",
" <td>31.28</td>\n",
" <td>0.546666</td>\n",
" <td>0.634794</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>1</td>\n",
" <td>28.74</td>\n",
" <td>35.63</td>\n",
" <td>0.593656</td>\n",
" <td>0.702665</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>1</td>\n",
" <td>18.28</td>\n",
" <td>39.50</td>\n",
" <td>0.496558</td>\n",
" <td>0.534624</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>0</td>\n",
" <td>10.12</td>\n",
" <td>31.39</td>\n",
" <td>0.133337</td>\n",
" <td>0.082606</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>0</td>\n",
" <td>10.39</td>\n",
" <td>29.47</td>\n",
" <td>0.088829</td>\n",
" <td>0.062198</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>0</td>\n",
" <td>21.46</td>\n",
" <td>29.34</td>\n",
" <td>0.294027</td>\n",
" <td>0.223902</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>1</td>\n",
" <td>33.56</td>\n",
" <td>40.37</td>\n",
" <td>0.806902</td>\n",
" <td>0.913332</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>1</td>\n",
" <td>37.91</td>\n",
" <td>22.92</td>\n",
" <td>0.438106</td>\n",
" <td>0.458048</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>1</td>\n",
" <td>31.81</td>\n",
" <td>47.56</td>\n",
" <td>0.959656</td>\n",
" <td>0.967716</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>0</td>\n",
" <td>25.88</td>\n",
" <td>44.58</td>\n",
" <td>0.770960</td>\n",
" <td>0.887885</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>1</td>\n",
" <td>38.40</td>\n",
" <td>47.85</td>\n",
" <td>1.091301</td>\n",
" <td>0.987144</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>0</td>\n",
" <td>26.62</td>\n",
" <td>25.50</td>\n",
" <td>0.292049</td>\n",
" <td>0.225940</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>0</td>\n",
" <td>14.36</td>\n",
" <td>21.87</td>\n",
" <td>-0.032692</td>\n",
" <td>0.028410</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>1</td>\n",
" <td>22.22</td>\n",
" <td>20.79</td>\n",
" <td>0.087491</td>\n",
" <td>0.065109</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>1</td>\n",
" <td>32.10</td>\n",
" <td>51.56</td>\n",
" <td>1.068443</td>\n",
" <td>0.984517</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>0</td>\n",
" <td>11.75</td>\n",
" <td>32.96</td>\n",
" <td>0.204600</td>\n",
" <td>0.129233</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>1</td>\n",
" <td>10.32</td>\n",
" <td>48.59</td>\n",
" <td>0.581396</td>\n",
" <td>0.664852</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>0</td>\n",
" <td>11.43</td>\n",
" <td>34.78</td>\n",
" <td>0.245584</td>\n",
" <td>0.164303</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>0</td>\n",
" <td>12.58</td>\n",
" <td>33.27</td>\n",
" <td>0.228245</td>\n",
" <td>0.149244</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>0</td>\n",
" <td>27.53</td>\n",
" <td>25.63</td>\n",
" <td>0.312551</td>\n",
" <td>0.252476</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>1</td>\n",
" <td>36.71</td>\n",
" <td>37.05</td>\n",
" <td>0.780489</td>\n",
" <td>0.899188</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>0</td>\n",
" <td>17.85</td>\n",
" <td>26.86</td>\n",
" <td>0.161955</td>\n",
" <td>0.102289</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" y x1 x2 predict1 predict2\n",
"0 1 16.35 49.94 0.729871 0.850564\n",
"1 1 34.43 56.16 1.231162 0.994966\n",
"2 1 39.19 36.89 0.823078 0.923739\n",
"3 1 23.58 56.88 1.045349 0.981132\n",
"4 0 29.92 27.05 0.394258 0.375201\n",
"5 1 25.26 44.38 0.754114 0.875451\n",
"6 1 36.51 48.98 1.084883 0.986447\n",
"7 1 11.70 55.55 0.787177 0.892025\n",
"8 0 32.21 31.28 0.546666 0.634794\n",
"9 1 28.74 35.63 0.593656 0.702665\n",
"10 1 18.28 39.50 0.496558 0.534624\n",
"11 0 10.12 31.39 0.133337 0.082606\n",
"12 0 10.39 29.47 0.088829 0.062198\n",
"13 0 21.46 29.34 0.294027 0.223902\n",
"14 1 33.56 40.37 0.806902 0.913332\n",
"15 1 37.91 22.92 0.438106 0.458048\n",
"16 1 31.81 47.56 0.959656 0.967716\n",
"17 0 25.88 44.58 0.770960 0.887885\n",
"18 1 38.40 47.85 1.091301 0.987144\n",
"19 0 26.62 25.50 0.292049 0.225940\n",
"20 0 14.36 21.87 -0.032692 0.028410\n",
"21 1 22.22 20.79 0.087491 0.065109\n",
"22 1 32.10 51.56 1.068443 0.984517\n",
"23 0 11.75 32.96 0.204600 0.129233\n",
"24 1 10.32 48.59 0.581396 0.664852\n",
"25 0 11.43 34.78 0.245584 0.164303\n",
"26 0 12.58 33.27 0.228245 0.149244\n",
"27 0 27.53 25.63 0.312551 0.252476\n",
"28 1 36.71 37.05 0.780489 0.899188\n",
"29 0 17.85 26.86 0.161955 0.102289"
]
},
"execution_count": 74,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predict2 = model2Fit.predict(sm.add_constant(mortgageDf[[\"x1\", \"x2\"]]))\n",
"mortgageDf['predict2'] = predict2\n",
"mortgageDf"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "tBfMgF0Y9usy",
"outputId": "29db0b46-acbc-42c5-ab11-4490a2eecc47"
},
"outputs": [
{
"data": {
"text/plain": [
"(array([0.21042055]), array([0.01806123]), array([0.99289663]))"
]
},
"execution_count": 75,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model2Fit.predict([[1, 20, 30]]), model2Fit.predict([[1, 20, 15]]), model2Fit.predict([[1, 40, 50]])"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "iLB_t1-lWjAn",
"outputId": "77dc990d-db61-4e4a-e26e-0593cadeb631"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" OLS Regression Results \n",
"==============================================================================\n",
"Dep. Variable: y R-squared: 0.197\n",
"Model: OLS Adj. R-squared: 0.168\n",
"Method: Least Squares F-statistic: 6.875\n",
"Date: Sun, 09 Jun 2024 Prob (F-statistic): 0.0140\n",
"Time: 15:09:54 Log-Likelihood: -18.211\n",
"No. Observations: 30 AIC: 40.42\n",
"Df Residuals: 28 BIC: 43.23\n",
"Df Model: 1 \n",
"Covariance Type: nonrobust \n",
"==============================================================================\n",
" coef std err t P>|t| [0.025 0.975]\n",
"------------------------------------------------------------------------------\n",
"const 0.0141 0.227 0.062 0.951 -0.451 0.479\n",
"x1 0.0227 0.009 2.622 0.014 0.005 0.040\n",
"==============================================================================\n",
"Omnibus: 5.223 Durbin-Watson: 2.358\n",
"Prob(Omnibus): 0.073 Jarque-Bera (JB): 1.806\n",
"Skew: -0.084 Prob(JB): 0.405\n",
"Kurtosis: 1.810 Cond. No. 70.8\n",
"==============================================================================\n",
"\n",
"Notes:\n",
"[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n"
]
}
],
"source": [
"model3 = sm.OLS(\n",
" mortgageDf[\"y\"],\n",
" sm.add_constant(mortgageDf[[\"x1\"]])\n",
")\n",
"model3Fit = model3.fit()\n",
"print(model3Fit.summary())"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "PuixWL0hWw1e",
"outputId": "237434c6-c5eb-4ccd-a39e-d73fb6f1d215"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Optimization terminated successfully.\n",
" Current function value: 0.579907\n",
" Iterations 5\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: y No. Observations: 30\n",
"Model: Logit Df Residuals: 28\n",
"Method: MLE Df Model: 1\n",
"Date: Sun, 09 Jun 2024 Pseudo R-squ.: 0.1525\n",
"Time: 15:09:54 Log-Likelihood: -17.397\n",
"converged: True LL-Null: -20.527\n",
"Covariance Type: nonrobust LLR p-value: 0.01235\n",
"==============================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"------------------------------------------------------------------------------\n",
"const -2.2077 1.140 -1.936 0.053 -4.442 0.027\n",
"x1 0.1043 0.046 2.282 0.022 0.015 0.194\n",
"==============================================================================\n"
]
}
],
"source": [
"model4 = sm.Logit(\n",
" mortgageDf[\"y\"],\n",
" sm.add_constant(mortgageDf[[\"x1\"]])\n",
")\n",
"model4Fit = model4.fit()\n",
"print(model4Fit.summary())"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "YLIrig6rXKhw",
"outputId": "211239e6-b133-460b-fa77-c68169153bfa"
},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 78,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"min = 0\n",
"min"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "w5OmNUfaXNsk",
"outputId": "14e9ae85-7cd1-47fd-a370-6904d3b170d5"
},
"outputs": [
{
"data": {
"text/plain": [
"(49.19, 30)"
]
},
"execution_count": 79,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"max = mortgageDf[\"x1\"].max() + 10\n",
"max, len(mortgageDf[\"x1\"])"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {
"id": "sBBshZgnXQzO"
},
"outputs": [],
"source": [
"x = np.linspace(min - 5, max + 5, 500)\n",
"# x"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {
"id": "2zxAZeT5XwgE"
},
"outputs": [],
"source": [
"import math"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {
"id": "X2BmYiiDXgbw"
},
"outputs": [],
"source": [
"lREq = 0.0141 + x * 0.0227\n",
"logREq = pow(math.e, (-2.2077 + 0.1043 * x))/ (1+ pow(math.e, (-2.2077 + 0.1043 * x)))"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "C_8MU1W7YgR8",
"outputId": "33939ed7-8fc1-4813-bb7c-06c6f792c694"
},
"outputs": [
{
"data": {
"text/plain": [
"500"
]
},
"execution_count": 83,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Sanity check: number of fitted values on the grid.\n",
"lREq.shape[0]"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 449
},
"id": "VZ6BxMqpXCOH",
"outputId": "b11dad1f-c306-46d0-fe8b-afc16e3f91f2"
},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Observed outcomes with both fitted curves drawn over the x grid.\n",
"plt.scatter(mortgageDf[\"x1\"], mortgageDf[\"y\"], color='blue', alpha=0.9, label='Data Points - scatter')\n",
"\n",
"# Both curves share the same styling; only the data, colour and legend label differ.\n",
"for _curve, _colour, _label in ((lREq, 'red', 'lREq'), (logREq, 'green', 'logREq')):\n",
"    plt.plot(x, _curve, color=_colour, alpha=0.9, label=_label)\n",
"\n",
"plt.xlabel('x1')\n",
"plt.ylabel('y')\n",
"plt.legend()\n",
"plt.grid(True)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 990
},
"id": "NqT4-52vZyo-",
"outputId": "96763409-58d1-4435-bc54-afcf8fc9d05f"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>y</th>\n",
" <th>x1</th>\n",
" <th>x2</th>\n",
" <th>predict1</th>\n",
" <th>predict2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>16.35</td>\n",
" <td>49.94</td>\n",
" <td>0.729871</td>\n",
" <td>0.850564</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>34.43</td>\n",
" <td>56.16</td>\n",
" <td>1.231162</td>\n",
" <td>0.994966</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>39.19</td>\n",
" <td>36.89</td>\n",
" <td>0.823078</td>\n",
" <td>0.923739</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>23.58</td>\n",
" <td>56.88</td>\n",
" <td>1.045349</td>\n",
" <td>0.981132</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>29.92</td>\n",
" <td>27.05</td>\n",
" <td>0.394258</td>\n",
" <td>0.375201</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1</td>\n",
" <td>25.26</td>\n",
" <td>44.38</td>\n",
" <td>0.754114</td>\n",
" <td>0.875451</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>1</td>\n",
" <td>36.51</td>\n",
" <td>48.98</td>\n",
" <td>1.084883</td>\n",
" <td>0.986447</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>1</td>\n",
" <td>11.70</td>\n",
" <td>55.55</td>\n",
" <td>0.787177</td>\n",
" <td>0.892025</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>0</td>\n",
" <td>32.21</td>\n",
" <td>31.28</td>\n",
" <td>0.546666</td>\n",
" <td>0.634794</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>1</td>\n",
" <td>28.74</td>\n",
" <td>35.63</td>\n",
" <td>0.593656</td>\n",
" <td>0.702665</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>1</td>\n",
" <td>18.28</td>\n",
" <td>39.50</td>\n",
" <td>0.496558</td>\n",
" <td>0.534624</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>0</td>\n",
" <td>10.12</td>\n",
" <td>31.39</td>\n",
" <td>0.133337</td>\n",
" <td>0.082606</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>0</td>\n",
" <td>10.39</td>\n",
" <td>29.47</td>\n",
" <td>0.088829</td>\n",
" <td>0.062198</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>0</td>\n",
" <td>21.46</td>\n",
" <td>29.34</td>\n",
" <td>0.294027</td>\n",
" <td>0.223902</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>1</td>\n",
" <td>33.56</td>\n",
" <td>40.37</td>\n",
" <td>0.806902</td>\n",
" <td>0.913332</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>1</td>\n",
" <td>37.91</td>\n",
" <td>22.92</td>\n",
" <td>0.438106</td>\n",
" <td>0.458048</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>1</td>\n",
" <td>31.81</td>\n",
" <td>47.56</td>\n",
" <td>0.959656</td>\n",
" <td>0.967716</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>0</td>\n",
" <td>25.88</td>\n",
" <td>44.58</td>\n",
" <td>0.770960</td>\n",
" <td>0.887885</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>1</td>\n",
" <td>38.40</td>\n",
" <td>47.85</td>\n",
" <td>1.091301</td>\n",
" <td>0.987144</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>0</td>\n",
" <td>26.62</td>\n",
" <td>25.50</td>\n",
" <td>0.292049</td>\n",
" <td>0.225940</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>0</td>\n",
" <td>14.36</td>\n",
" <td>21.87</td>\n",
" <td>-0.032692</td>\n",
" <td>0.028410</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>1</td>\n",
" <td>22.22</td>\n",
" <td>20.79</td>\n",
" <td>0.087491</td>\n",
" <td>0.065109</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>1</td>\n",
" <td>32.10</td>\n",
" <td>51.56</td>\n",
" <td>1.068443</td>\n",
" <td>0.984517</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>0</td>\n",
" <td>11.75</td>\n",
" <td>32.96</td>\n",
" <td>0.204600</td>\n",
" <td>0.129233</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>1</td>\n",
" <td>10.32</td>\n",
" <td>48.59</td>\n",
" <td>0.581396</td>\n",
" <td>0.664852</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>0</td>\n",
" <td>11.43</td>\n",
" <td>34.78</td>\n",
" <td>0.245584</td>\n",
" <td>0.164303</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>0</td>\n",
" <td>12.58</td>\n",
" <td>33.27</td>\n",
" <td>0.228245</td>\n",
" <td>0.149244</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>0</td>\n",
" <td>27.53</td>\n",
" <td>25.63</td>\n",
" <td>0.312551</td>\n",
" <td>0.252476</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>1</td>\n",
" <td>36.71</td>\n",
" <td>37.05</td>\n",
" <td>0.780489</td>\n",
" <td>0.899188</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>0</td>\n",
" <td>17.85</td>\n",
" <td>26.86</td>\n",
" <td>0.161955</td>\n",
" <td>0.102289</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" y x1 x2 predict1 predict2\n",
"0 1 16.35 49.94 0.729871 0.850564\n",
"1 1 34.43 56.16 1.231162 0.994966\n",
"2 1 39.19 36.89 0.823078 0.923739\n",
"3 1 23.58 56.88 1.045349 0.981132\n",
"4 0 29.92 27.05 0.394258 0.375201\n",
"5 1 25.26 44.38 0.754114 0.875451\n",
"6 1 36.51 48.98 1.084883 0.986447\n",
"7 1 11.70 55.55 0.787177 0.892025\n",
"8 0 32.21 31.28 0.546666 0.634794\n",
"9 1 28.74 35.63 0.593656 0.702665\n",
"10 1 18.28 39.50 0.496558 0.534624\n",
"11 0 10.12 31.39 0.133337 0.082606\n",
"12 0 10.39 29.47 0.088829 0.062198\n",
"13 0 21.46 29.34 0.294027 0.223902\n",
"14 1 33.56 40.37 0.806902 0.913332\n",
"15 1 37.91 22.92 0.438106 0.458048\n",
"16 1 31.81 47.56 0.959656 0.967716\n",
"17 0 25.88 44.58 0.770960 0.887885\n",
"18 1 38.40 47.85 1.091301 0.987144\n",
"19 0 26.62 25.50 0.292049 0.225940\n",
"20 0 14.36 21.87 -0.032692 0.028410\n",
"21 1 22.22 20.79 0.087491 0.065109\n",
"22 1 32.10 51.56 1.068443 0.984517\n",
"23 0 11.75 32.96 0.204600 0.129233\n",
"24 1 10.32 48.59 0.581396 0.664852\n",
"25 0 11.43 34.78 0.245584 0.164303\n",
"26 0 12.58 33.27 0.228245 0.149244\n",
"27 0 27.53 25.63 0.312551 0.252476\n",
"28 1 36.71 37.05 0.780489 0.899188\n",
"29 0 17.85 26.86 0.161955 0.102289"
]
},
"execution_count": 85,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the full table, including the prediction columns added so far.\n",
"mortgageDf"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 990
},
"id": "jLJD6VGqY1Xc",
"outputId": "dacd3002-ceb1-4b40-e315-9adba673b28b"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>y</th>\n",
" <th>x1</th>\n",
" <th>x2</th>\n",
" <th>predict1</th>\n",
" <th>predict2</th>\n",
" <th>yHat2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>16.35</td>\n",
" <td>49.94</td>\n",
" <td>0.729871</td>\n",
" <td>0.850564</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>34.43</td>\n",
" <td>56.16</td>\n",
" <td>1.231162</td>\n",
" <td>0.994966</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>39.19</td>\n",
" <td>36.89</td>\n",
" <td>0.823078</td>\n",
" <td>0.923739</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>23.58</td>\n",
" <td>56.88</td>\n",
" <td>1.045349</td>\n",
" <td>0.981132</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>29.92</td>\n",
" <td>27.05</td>\n",
" <td>0.394258</td>\n",
" <td>0.375201</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1</td>\n",
" <td>25.26</td>\n",
" <td>44.38</td>\n",
" <td>0.754114</td>\n",
" <td>0.875451</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>1</td>\n",
" <td>36.51</td>\n",
" <td>48.98</td>\n",
" <td>1.084883</td>\n",
" <td>0.986447</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>1</td>\n",
" <td>11.70</td>\n",
" <td>55.55</td>\n",
" <td>0.787177</td>\n",
" <td>0.892025</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>0</td>\n",
" <td>32.21</td>\n",
" <td>31.28</td>\n",
" <td>0.546666</td>\n",
" <td>0.634794</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>1</td>\n",
" <td>28.74</td>\n",
" <td>35.63</td>\n",
" <td>0.593656</td>\n",
" <td>0.702665</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>1</td>\n",
" <td>18.28</td>\n",
" <td>39.50</td>\n",
" <td>0.496558</td>\n",
" <td>0.534624</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>0</td>\n",
" <td>10.12</td>\n",
" <td>31.39</td>\n",
" <td>0.133337</td>\n",
" <td>0.082606</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>0</td>\n",
" <td>10.39</td>\n",
" <td>29.47</td>\n",
" <td>0.088829</td>\n",
" <td>0.062198</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>0</td>\n",
" <td>21.46</td>\n",
" <td>29.34</td>\n",
" <td>0.294027</td>\n",
" <td>0.223902</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>1</td>\n",
" <td>33.56</td>\n",
" <td>40.37</td>\n",
" <td>0.806902</td>\n",
" <td>0.913332</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>1</td>\n",
" <td>37.91</td>\n",
" <td>22.92</td>\n",
" <td>0.438106</td>\n",
" <td>0.458048</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>1</td>\n",
" <td>31.81</td>\n",
" <td>47.56</td>\n",
" <td>0.959656</td>\n",
" <td>0.967716</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>0</td>\n",
" <td>25.88</td>\n",
" <td>44.58</td>\n",
" <td>0.770960</td>\n",
" <td>0.887885</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>1</td>\n",
" <td>38.40</td>\n",
" <td>47.85</td>\n",
" <td>1.091301</td>\n",
" <td>0.987144</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>0</td>\n",
" <td>26.62</td>\n",
" <td>25.50</td>\n",
" <td>0.292049</td>\n",
" <td>0.225940</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>0</td>\n",
" <td>14.36</td>\n",
" <td>21.87</td>\n",
" <td>-0.032692</td>\n",
" <td>0.028410</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>1</td>\n",
" <td>22.22</td>\n",
" <td>20.79</td>\n",
" <td>0.087491</td>\n",
" <td>0.065109</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>1</td>\n",
" <td>32.10</td>\n",
" <td>51.56</td>\n",
" <td>1.068443</td>\n",
" <td>0.984517</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>0</td>\n",
" <td>11.75</td>\n",
" <td>32.96</td>\n",
" <td>0.204600</td>\n",
" <td>0.129233</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>1</td>\n",
" <td>10.32</td>\n",
" <td>48.59</td>\n",
" <td>0.581396</td>\n",
" <td>0.664852</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>0</td>\n",
" <td>11.43</td>\n",
" <td>34.78</td>\n",
" <td>0.245584</td>\n",
" <td>0.164303</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>0</td>\n",
" <td>12.58</td>\n",
" <td>33.27</td>\n",
" <td>0.228245</td>\n",
" <td>0.149244</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>0</td>\n",
" <td>27.53</td>\n",
" <td>25.63</td>\n",
" <td>0.312551</td>\n",
" <td>0.252476</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>1</td>\n",
" <td>36.71</td>\n",
" <td>37.05</td>\n",
" <td>0.780489</td>\n",
" <td>0.899188</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>0</td>\n",
" <td>17.85</td>\n",
" <td>26.86</td>\n",
" <td>0.161955</td>\n",
" <td>0.102289</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" y x1 x2 predict1 predict2 yHat2\n",
"0 1 16.35 49.94 0.729871 0.850564 1\n",
"1 1 34.43 56.16 1.231162 0.994966 1\n",
"2 1 39.19 36.89 0.823078 0.923739 1\n",
"3 1 23.58 56.88 1.045349 0.981132 1\n",
"4 0 29.92 27.05 0.394258 0.375201 0\n",
"5 1 25.26 44.38 0.754114 0.875451 1\n",
"6 1 36.51 48.98 1.084883 0.986447 1\n",
"7 1 11.70 55.55 0.787177 0.892025 1\n",
"8 0 32.21 31.28 0.546666 0.634794 1\n",
"9 1 28.74 35.63 0.593656 0.702665 1\n",
"10 1 18.28 39.50 0.496558 0.534624 1\n",
"11 0 10.12 31.39 0.133337 0.082606 0\n",
"12 0 10.39 29.47 0.088829 0.062198 0\n",
"13 0 21.46 29.34 0.294027 0.223902 0\n",
"14 1 33.56 40.37 0.806902 0.913332 1\n",
"15 1 37.91 22.92 0.438106 0.458048 0\n",
"16 1 31.81 47.56 0.959656 0.967716 1\n",
"17 0 25.88 44.58 0.770960 0.887885 1\n",
"18 1 38.40 47.85 1.091301 0.987144 1\n",
"19 0 26.62 25.50 0.292049 0.225940 0\n",
"20 0 14.36 21.87 -0.032692 0.028410 0\n",
"21 1 22.22 20.79 0.087491 0.065109 0\n",
"22 1 32.10 51.56 1.068443 0.984517 1\n",
"23 0 11.75 32.96 0.204600 0.129233 0\n",
"24 1 10.32 48.59 0.581396 0.664852 1\n",
"25 0 11.43 34.78 0.245584 0.164303 0\n",
"26 0 12.58 33.27 0.228245 0.149244 0\n",
"27 0 27.53 25.63 0.312551 0.252476 0\n",
"28 1 36.71 37.05 0.780489 0.899188 1\n",
"29 0 17.85 26.86 0.161955 0.102289 0"
]
},
"execution_count": 86,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Classify as 1 when the predicted probability is strictly above 0.5, otherwise 0.\n",
"mortgageDf['yHat2'] = (mortgageDf['predict2'] > 0.5).astype(\"int64\")\n",
"mortgageDf"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "HR8tECzUY1NR"
},
"source": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "FRq8XsHGYvyA"
},
"source": [
"## Hold-out validation"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"id": "ugSmcm30aIO2",
"outputId": "d71903ab-b450-476a-b05f-9f5230920e9e"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>y</th>\n",
" <th>x1</th>\n",
" <th>x2</th>\n",
" <th>predict1</th>\n",
" <th>predict2</th>\n",
" <th>yHat2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>16.35</td>\n",
" <td>49.94</td>\n",
" <td>0.729871</td>\n",
" <td>0.850564</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>0</td>\n",
" <td>10.12</td>\n",
" <td>31.39</td>\n",
" <td>0.133337</td>\n",
" <td>0.082606</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>1</td>\n",
" <td>36.51</td>\n",
" <td>48.98</td>\n",
" <td>1.084883</td>\n",
" <td>0.986447</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>1</td>\n",
" <td>10.32</td>\n",
" <td>48.59</td>\n",
" <td>0.581396</td>\n",
" <td>0.664852</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>0</td>\n",
" <td>25.88</td>\n",
" <td>44.58</td>\n",
" <td>0.770960</td>\n",
" <td>0.887885</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" y x1 x2 predict1 predict2 yHat2\n",
"0 1 16.35 49.94 0.729871 0.850564 1\n",
"11 0 10.12 31.39 0.133337 0.082606 0\n",
"6 1 36.51 48.98 1.084883 0.986447 1\n",
"24 1 10.32 48.59 0.581396 0.664852 1\n",
"17 0 25.88 44.58 0.770960 0.887885 1"
]
},
"execution_count": 87,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Hold out 15% of the rows for testing. The seed is fixed so the split, and every\n",
"# hold-out figure computed from it, is identical on each Restart & Run All.\n",
"trainSet, testSet = train_test_split(mortgageDf, test_size=0.15, random_state=55)\n",
"\n",
"trainSet.head()"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "8gQc9YQqaQ0G",
"outputId": "6a67b0ce-9f65-4711-8b45-d27929aad16b"
},
"outputs": [
{
"data": {
"text/plain": [
"((30, 6), (25, 6), (5, 6))"
]
},
"execution_count": 88,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mortgageDf.shape, trainSet.shape, testSet.shape"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "4Lz67KCiaQtC",
"outputId": "4dbed20d-28d4-470a-d898-df6f4cd557d7"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Optimization terminated successfully.\n",
" Current function value: 0.379717\n",
" Iterations 7\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: y No. Observations: 25\n",
"Model: Logit Df Residuals: 22\n",
"Method: MLE Df Model: 2\n",
"Date: Sun, 09 Jun 2024 Pseudo R-squ.: 0.4464\n",
"Time: 15:09:54 Log-Likelihood: -9.4929\n",
"converged: True LL-Null: -17.148\n",
"Covariance Type: nonrobust LLR p-value: 0.0004735\n",
"==============================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"------------------------------------------------------------------------------\n",
"const -8.5488 3.211 -2.662 0.008 -14.843 -2.254\n",
"x1 0.1183 0.066 1.783 0.075 -0.012 0.248\n",
"x2 0.1613 0.061 2.642 0.008 0.042 0.281\n",
"==============================================================================\n"
]
}
],
"source": [
"# Logistic regression of y on x1 and x2, fitted on the training rows only.\n",
"modelHoldOut = sm.Logit(endog=trainSet[\"y\"], exog=sm.add_constant(trainSet[[\"x1\", \"x2\"]]))\n",
"modelHoldOutFit = modelHoldOut.fit()\n",
"print(modelHoldOutFit.summary())"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: y No. Observations: 30\n",
"Model: Logit Df Residuals: 28\n",
"Method: MLE Df Model: 1\n",
"Date: Sun, 09 Jun 2024 Pseudo R-squ.: 0.1525\n",
"Time: 15:09:54 Log-Likelihood: -17.397\n",
"converged: True LL-Null: -20.527\n",
"Covariance Type: nonrobust LLR p-value: 0.01235\n",
"==============================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"------------------------------------------------------------------------------\n",
"const -2.2077 1.140 -1.936 0.053 -4.442 0.027\n",
"x1 0.1043 0.046 2.282 0.022 0.015 0.194\n",
"==============================================================================\n"
]
}
],
"source": [
"# Re-print the x1-only model fitted on all rows, for comparison with the hold-out fit.\n",
"print(model4Fit.summary())"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"id": "nRTs2yv9alHP",
"outputId": "f0305ea5-32c2-45b5-e09e-73dc9b467c1c"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>y</th>\n",
" <th>x1</th>\n",
" <th>x2</th>\n",
" <th>predict1</th>\n",
" <th>predict2</th>\n",
" <th>yHat2</th>\n",
" <th>predictHoldOut</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>0</td>\n",
" <td>11.75</td>\n",
" <td>32.96</td>\n",
" <td>0.204600</td>\n",
" <td>0.129233</td>\n",
" <td>0</td>\n",
" <td>0.136728</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>1</td>\n",
" <td>18.28</td>\n",
" <td>39.50</td>\n",
" <td>0.496558</td>\n",
" <td>0.534624</td>\n",
" <td>1</td>\n",
" <td>0.496200</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>0</td>\n",
" <td>11.43</td>\n",
" <td>34.78</td>\n",
" <td>0.245584</td>\n",
" <td>0.164303</td>\n",
" <td>0</td>\n",
" <td>0.169793</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>1</td>\n",
" <td>31.81</td>\n",
" <td>47.56</td>\n",
" <td>0.959656</td>\n",
" <td>0.967716</td>\n",
" <td>1</td>\n",
" <td>0.947146</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>1</td>\n",
" <td>28.74</td>\n",
" <td>35.63</td>\n",
" <td>0.593656</td>\n",
" <td>0.702665</td>\n",
" <td>1</td>\n",
" <td>0.645341</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" y x1 x2 predict1 predict2 yHat2 predictHoldOut\n",
"23 0 11.75 32.96 0.204600 0.129233 0 0.136728\n",
"10 1 18.28 39.50 0.496558 0.534624 1 0.496200\n",
"25 0 11.43 34.78 0.245584 0.164303 0 0.169793\n",
"16 1 31.81 47.56 0.959656 0.967716 1 0.947146\n",
"9 1 28.74 35.63 0.593656 0.702665 1 0.645341"
]
},
"execution_count": 91,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Predicted probabilities for the held-out rows from the model fitted on trainSet.\n",
"predictHoldOut = modelHoldOutFit.predict(\n",
"    exog=sm.add_constant(testSet[[\"x1\", \"x2\"]])\n",
")\n",
"testSet['predictHoldOut'] = predictHoldOut\n",
"testSet"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"id": "1SXIG-xRbFc-",
"outputId": "011931e1-25e3-4261-9103-5aad2b1371ab"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>y</th>\n",
" <th>x1</th>\n",
" <th>x2</th>\n",
" <th>predict1</th>\n",
" <th>predict2</th>\n",
" <th>yHat2</th>\n",
" <th>predictHoldOut</th>\n",
" <th>yHatHoldOut</th>\n",
" <th>isHoldOutCorrect</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>0</td>\n",
" <td>11.75</td>\n",
" <td>32.96</td>\n",
" <td>0.204600</td>\n",
" <td>0.129233</td>\n",
" <td>0</td>\n",
" <td>0.136728</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>1</td>\n",
" <td>18.28</td>\n",
" <td>39.50</td>\n",
" <td>0.496558</td>\n",
" <td>0.534624</td>\n",
" <td>1</td>\n",
" <td>0.496200</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>0</td>\n",
" <td>11.43</td>\n",
" <td>34.78</td>\n",
" <td>0.245584</td>\n",
" <td>0.164303</td>\n",
" <td>0</td>\n",
" <td>0.169793</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>1</td>\n",
" <td>31.81</td>\n",
" <td>47.56</td>\n",
" <td>0.959656</td>\n",
" <td>0.967716</td>\n",
" <td>1</td>\n",
" <td>0.947146</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>1</td>\n",
" <td>28.74</td>\n",
" <td>35.63</td>\n",
" <td>0.593656</td>\n",
" <td>0.702665</td>\n",
" <td>1</td>\n",
" <td>0.645341</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" y x1 x2 predict1 predict2 yHat2 predictHoldOut yHatHoldOut \\\n",
"23 0 11.75 32.96 0.204600 0.129233 0 0.136728 0 \n",
"10 1 18.28 39.50 0.496558 0.534624 1 0.496200 0 \n",
"25 0 11.43 34.78 0.245584 0.164303 0 0.169793 0 \n",
"16 1 31.81 47.56 0.959656 0.967716 1 0.947146 1 \n",
"9 1 28.74 35.63 0.593656 0.702665 1 0.645341 1 \n",
"\n",
" isHoldOutCorrect \n",
"23 1 \n",
"10 0 \n",
"25 1 \n",
"16 1 \n",
"9 1 "
]
},
"execution_count": 92,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Predicted class: 1 when the hold-out probability is strictly above 0.5, otherwise 0.\n",
"testSet['yHatHoldOut'] = (testSet['predictHoldOut'] > 0.5).astype(\"int64\")\n",
"# 1 where the predicted class equals the observed y, otherwise 0.\n",
"testSet['isHoldOutCorrect'] = (testSet['y'] == testSet['yHatHoldOut']).astype(\"int64\")\n",
"testSet"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "bntHtTtwbMYi",
"outputId": "1bfe2cc6-2034-4f1f-864b-630b19fcc96d"
},
"outputs": [
{
"data": {
"text/plain": [
"80.0"
]
},
"execution_count": 93,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Hold-out accuracy in percent: share of test rows whose predicted class matches y.\n",
"accuracy = testSet['isHoldOutCorrect'].mean() * 100\n",
"accuracy"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "z7KjTxz4caDz"
},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "OVzr96gecftN"
},
"source": [
"## K-fold cross-validation"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {
"id": "x56ASbXkciNv"
},
"outputs": [],
"source": [
"from sklearn.model_selection import KFold"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {
"id": "tjliLeknckTS"
},
"outputs": [],
"source": [
"# 5-fold splitter; rows are shuffled with a fixed seed so fold membership is repeatable.\n",
"kf = KFold(\n",
"    n_splits=5,\n",
"    shuffle=True,\n",
"    random_state=55,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "2Inr3vF2cn14",
"outputId": "a2ac2909-88e6-49a9-8d44-c1c50a44bf0e"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Optimization terminated successfully.\n",
" Current function value: 0.385918\n",
" Iterations 7\n",
"expr=1\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: y No. Observations: 24\n",
"Model: Logit Df Residuals: 21\n",
"Method: MLE Df Model: 2\n",
"Date: Sun, 09 Jun 2024 Pseudo R-squ.: 0.4318\n",
"Time: 15:09:54 Log-Likelihood: -9.2620\n",
"converged: True LL-Null: -16.301\n",
"Covariance Type: nonrobust LLR p-value: 0.0008773\n",
"==============================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"------------------------------------------------------------------------------\n",
"const -8.3046 3.201 -2.595 0.009 -14.578 -2.031\n",
"x1 0.1361 0.068 2.014 0.044 0.004 0.269\n",
"x2 0.1491 0.062 2.397 0.017 0.027 0.271\n",
"==============================================================================\n",
"Optimization terminated successfully.\n",
" Current function value: 0.330523\n",
" Iterations 8\n",
"expr=2\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: y No. Observations: 24\n",
"Model: Logit Df Residuals: 21\n",
"Method: MLE Df Model: 2\n",
"Date: Sun, 09 Jun 2024 Pseudo R-squ.: 0.5208\n",
"Time: 15:09:54 Log-Likelihood: -7.9326\n",
"converged: True LL-Null: -16.552\n",
"Covariance Type: nonrobust LLR p-value: 0.0001805\n",
"==============================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"------------------------------------------------------------------------------\n",
"const -10.9794 4.293 -2.557 0.011 -19.394 -2.565\n",
"x1 0.1529 0.080 1.912 0.056 -0.004 0.310\n",
"x2 0.2185 0.092 2.384 0.017 0.039 0.398\n",
"==============================================================================\n",
"Optimization terminated successfully.\n",
" Current function value: 0.222706\n",
" Iterations 8\n",
"expr=3\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: y No. Observations: 24\n",
"Model: Logit Df Residuals: 21\n",
"Method: MLE Df Model: 2\n",
"Date: Sun, 09 Jun 2024 Pseudo R-squ.: 0.6771\n",
"Time: 15:09:54 Log-Likelihood: -5.3449\n",
"converged: True LL-Null: -16.552\n",
"Covariance Type: nonrobust LLR p-value: 1.358e-05\n",
"==============================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"------------------------------------------------------------------------------\n",
"const -15.7518 6.276 -2.510 0.012 -28.052 -3.451\n",
"x1 0.1539 0.101 1.524 0.127 -0.044 0.352\n",
"x2 0.3209 0.139 2.316 0.021 0.049 0.592\n",
"==============================================================================\n",
"Optimization terminated successfully.\n",
" Current function value: 0.433994\n",
" Iterations 7\n",
"expr=4\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: y No. Observations: 24\n",
"Model: Logit Df Residuals: 21\n",
"Method: MLE Df Model: 2\n",
"Date: Sun, 09 Jun 2024 Pseudo R-squ.: 0.3707\n",
"Time: 15:09:54 Log-Likelihood: -10.416\n",
"converged: True LL-Null: -16.552\n",
"Covariance Type: nonrobust LLR p-value: 0.002163\n",
"==============================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"------------------------------------------------------------------------------\n",
"const -8.1881 3.232 -2.533 0.011 -14.523 -1.853\n",
"x1 0.1129 0.064 1.752 0.080 -0.013 0.239\n",
"x2 0.1604 0.065 2.453 0.014 0.032 0.289\n",
"==============================================================================\n",
"Optimization terminated successfully.\n",
" Current function value: 0.368962\n",
" Iterations 7\n",
"expr=5\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: y No. Observations: 24\n",
"Model: Logit Df Residuals: 21\n",
"Method: MLE Df Model: 2\n",
"Date: Sun, 09 Jun 2024 Pseudo R-squ.: 0.4423\n",
"Time: 15:09:54 Log-Likelihood: -8.8551\n",
"converged: True LL-Null: -15.878\n",
"Covariance Type: nonrobust LLR p-value: 0.0008917\n",
"==============================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"------------------------------------------------------------------------------\n",
"const -8.1001 3.273 -2.475 0.013 -14.516 -1.685\n",
"x1 0.1358 0.070 1.952 0.051 -0.001 0.272\n",
"x2 0.1503 0.067 2.229 0.026 0.018 0.282\n",
"==============================================================================\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/v4/9b_k_xyj56ggnxlhf09pt8y40000gn/T/ipykernel_36878/46880013.py:22: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" valSet['val_predictions'] = val_predictions\n",
"/var/folders/v4/9b_k_xyj56ggnxlhf09pt8y40000gn/T/ipykernel_36878/46880013.py:23: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" valSet['yHatCross'] = valSet['val_predictions'].apply(lambda x: 1 if x > 0.5 else 0)\n",
"/var/folders/v4/9b_k_xyj56ggnxlhf09pt8y40000gn/T/ipykernel_36878/46880013.py:24: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" valSet['isCrossCorrect'] = valSet.apply(lambda row: 1 if row['y'] == row['yHatCross'] else 0, axis=1)\n",
"/var/folders/v4/9b_k_xyj56ggnxlhf09pt8y40000gn/T/ipykernel_36878/46880013.py:22: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" valSet['val_predictions'] = val_predictions\n",
"/var/folders/v4/9b_k_xyj56ggnxlhf09pt8y40000gn/T/ipykernel_36878/46880013.py:23: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" valSet['yHatCross'] = valSet['val_predictions'].apply(lambda x: 1 if x > 0.5 else 0)\n",
"/var/folders/v4/9b_k_xyj56ggnxlhf09pt8y40000gn/T/ipykernel_36878/46880013.py:24: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" valSet['isCrossCorrect'] = valSet.apply(lambda row: 1 if row['y'] == row['yHatCross'] else 0, axis=1)\n",
"/var/folders/v4/9b_k_xyj56ggnxlhf09pt8y40000gn/T/ipykernel_36878/46880013.py:22: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" valSet['val_predictions'] = val_predictions\n",
"/var/folders/v4/9b_k_xyj56ggnxlhf09pt8y40000gn/T/ipykernel_36878/46880013.py:23: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" valSet['yHatCross'] = valSet['val_predictions'].apply(lambda x: 1 if x > 0.5 else 0)\n",
"/var/folders/v4/9b_k_xyj56ggnxlhf09pt8y40000gn/T/ipykernel_36878/46880013.py:24: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" valSet['isCrossCorrect'] = valSet.apply(lambda row: 1 if row['y'] == row['yHatCross'] else 0, axis=1)\n",
"/var/folders/v4/9b_k_xyj56ggnxlhf09pt8y40000gn/T/ipykernel_36878/46880013.py:22: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" valSet['val_predictions'] = val_predictions\n",
"/var/folders/v4/9b_k_xyj56ggnxlhf09pt8y40000gn/T/ipykernel_36878/46880013.py:23: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" valSet['yHatCross'] = valSet['val_predictions'].apply(lambda x: 1 if x > 0.5 else 0)\n",
"/var/folders/v4/9b_k_xyj56ggnxlhf09pt8y40000gn/T/ipykernel_36878/46880013.py:24: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" valSet['isCrossCorrect'] = valSet.apply(lambda row: 1 if row['y'] == row['yHatCross'] else 0, axis=1)\n",
"/var/folders/v4/9b_k_xyj56ggnxlhf09pt8y40000gn/T/ipykernel_36878/46880013.py:22: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" valSet['val_predictions'] = val_predictions\n",
"/var/folders/v4/9b_k_xyj56ggnxlhf09pt8y40000gn/T/ipykernel_36878/46880013.py:23: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" valSet['yHatCross'] = valSet['val_predictions'].apply(lambda x: 1 if x > 0.5 else 0)\n",
"/var/folders/v4/9b_k_xyj56ggnxlhf09pt8y40000gn/T/ipykernel_36878/46880013.py:24: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" valSet['isCrossCorrect'] = valSet.apply(lambda row: 1 if row['y'] == row['yHatCross'] else 0, axis=1)\n"
]
}
],
"source": [
"check = kf.split(mortgageDf)\n",
"check\n",
"experiment = 1\n",
"# Loop through each fold\n",
"# Initialize variables to store results\n",
"accuracies = []\n",
"\n",
"for train_index, val_index in check:\n",
" # Split the data\n",
" trainSet, valSet = mortgageDf.iloc[train_index], mortgageDf.iloc[val_index]\n",
"\n",
" # Fit the model\n",
"\n",
" trainModel = sm.Logit(\n",
" trainSet[\"y\"],\n",
" sm.add_constant(trainSet[[\"x1\", \"x2\"]])\n",
" )\n",
" trainModelFit = trainModel.fit()\n",
"\n",
" # Predict on the validation set\n",
" val_predictions = trainModelFit.predict(sm.add_constant(valSet[[\"x1\", \"x2\"]]))\n",
" valSet['val_predictions'] = val_predictions\n",
" valSet['yHatCross'] = valSet['val_predictions'].apply(lambda x: 1 if x > 0.5 else 0)\n",
" valSet['isCrossCorrect'] = valSet.apply(lambda row: 1 if row['y'] == row['yHatCross'] else 0, axis=1)\n",
" accuracy = (np.sum(valSet['isCrossCorrect']) / len(valSet['yHatCross'])) * 100\n",
" accuracies.append(accuracy)\n",
"\n",
"\n",
" # Print summary for each fold (optional)\n",
" print(f'expr={experiment}')\n",
" experiment = experiment +1\n",
" print(trainModelFit.summary())"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ZrdsnnRhfE9w",
"outputId": "78aaab6b-1f66-43cf-c2a7-c18f84430a8e"
},
"outputs": [
{
"data": {
"text/plain": [
"[100.0, 83.33333333333334, 66.66666666666666, 100.0, 83.33333333333334]"
]
},
"execution_count": 97,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"accuracies"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ZPFkA9gAfLyJ",
"outputId": "f90e1a89-2679-4daf-98db-349e5f8ed1db"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Average accuracies across all folds: 86.66666666666667\n"
]
}
],
"source": [
"print(f\"Average accuracies across all folds: {sum(accuracies) /len(accuracies)}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}