{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "xwFyEsosINqT"
   },
   "outputs": [],
   "source": [
    "# Core numeric / tabular libraries used throughout the notebook.\n",
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "pKewSQysItJ-"
   },
   "outputs": [],
   "source": [
    "# statsmodels supplies the OLS estimator used for every model below.\n",
    "# https://www.statsmodels.org/stable/index.html\n",
    "import statsmodels.api as sm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "Lz-DyAtNWsJR"
   },
   "outputs": [],
   "source": [
    "# Optional: download the dataset from https://www.dropbox.com/scl/fi/bkcdp9tpqqh6dfr6phtt8/AnnArbor.xlsx?rlkey=0agfqwc7f0kt7oqb3e2h6q3qs&dl=1\n",
    "# and add it to Colab if you prefer the local-file read in the next cell."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "0zM8FGMJXJ70"
   },
   "outputs": [],
   "source": [
    "# Load the Ann Arbor dataset directly from the shared link.\n",
    "# Alternative when the workbook sits next to the notebook:\n",
    "# annArborDf = pd.read_excel(\"./AnnArbor.xlsx\")\n",
    "annArborDf = pd.read_excel(\"https://www.dropbox.com/scl/fi/bkcdp9tpqqh6dfr6phtt8/AnnArbor.xlsx?rlkey=0agfqwc7f0kt7oqb3e2h6q3qs&dl=1\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 1000
    },
    "id": "t0LUca0Myqw5",
    "outputId": "249ab087-895f-4fa6-993e-e8dd50ef87c1"
   },
   "outputs": [],
   "source": [
    "annArborDf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "GQRNPIeyy6ub",
    "outputId": "00211933-f2b1-40c6-d9cf-187560ffa305"
   },
   "outputs": [],
   "source": [
    "# Total number of elements (rows * columns).\n",
    "annArborDf.size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "yumMybniy85d"
   },
   "outputs": [],
   "source": [
    "# Summary statistics for the numeric columns.\n",
    "annArborDf.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "aspq6hoPy_xZ",
    "outputId": "96892272-a1d5-400e-a177-6c96746619d8"
   },
   "outputs": [],
   "source": [
    "# (rows, columns)\n",
    "annArborDf.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "z_hVTvPrzYJr"
   },
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 34
    },
    "id": "pIniVuaIzaaZ",
    "outputId": "6a061f6a-8bff-42c0-d705-0c2bd06eb5ff"
   },
   "outputs": [],
   "source": [
    "# Plotting\n",
    "fig1 = plt.figure(\n",
    "    figsize=(8, 8)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 449
    },
    "id": "VHdpDE7o42Pf",
    "outputId": "ac876802-b6d1-4926-d069-0532ee9e7a0b"
   },
   "outputs": [],
   "source": [
    "# Rent against number of bedrooms.\n",
    "plt.scatter(\n",
    "    annArborDf[\"Beds\"],\n",
    "    annArborDf[\"Rent\"],\n",
    "    color='blue',\n",
    "    alpha=0.9,\n",
    "    label='Data Points - scatter',\n",
    ")\n",
    "\n",
    "plt.xlabel('Beds')\n",
    "plt.ylabel('Rent')\n",
    "plt.legend()\n",
    "plt.grid(True)\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 449
    },
    "id": "knAa4W9R47rZ",
    "outputId": "81359d91-03b7-4f70-c381-c88172f800a9"
   },
   "outputs": [],
   "source": [
    "# Rent against number of bathrooms.\n",
    "plt.scatter(\n",
    "    annArborDf[\"Baths\"],\n",
    "    annArborDf[\"Rent\"],\n",
    "    color='blue',\n",
    "    alpha=0.9,\n",
    "    label='Data Points - scatter',\n",
    ")\n",
    "\n",
    "plt.xlabel('Baths')\n",
    "plt.ylabel('Rent')\n",
    "plt.legend()\n",
    "plt.grid(True)\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 449
    },
    "id": "dOnWJbFOzczV",
    "outputId": "c6d6b86b-dd85-45d1-b543-928441c11dc4"
   },
   "outputs": [],
   "source": [
    "# Rent against square footage.\n",
    "plt.scatter(\n",
    "    annArborDf[\"Sqft\"],\n",
    "    annArborDf[\"Rent\"],\n",
    "    color='blue',\n",
    "    alpha=0.9,\n",
    "    label='Data Points - scatter',\n",
    ")\n",
    "\n",
    "plt.xlabel('Sqft')\n",
    "plt.ylabel('Rent')\n",
    "plt.legend()\n",
    "plt.grid(True)\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "alIhUPPUzvli",
    "outputId": "8ed14c4b-a596-49ac-912a-0dcb4145df89"
   },
   "outputs": [],
   "source": [
    "# Model 1: Rent ~ const + Sqft.\n",
    "# sm.add_constant adds the 'const' column that serves as the intercept term.\n",
    "rentSqftModel1 = sm.OLS(\n",
    "    annArborDf[\"Rent\"],\n",
    "    sm.add_constant(annArborDf[[\"Sqft\"]])\n",
    ")\n",
    "rentSqftModel1Fit = rentSqftModel1.fit()\n",
    "print(rentSqftModel1Fit.summary())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# exportModel comes from the project-local functions.exportModel module;\n",
    "# presumably it persists the fitted model together with this metadata -\n",
    "# TODO confirm against its source.\n",
    "from functions.exportModel import exportModel\n",
    "exportModel({\n",
    "    \"modelName\": \"rentSqftModel1Fit\",\n",
    "    \"model\": rentSqftModel1Fit,\n",
    "    \"description\": \"Predict Rent based on Sqft for annArborDf\",\n",
    "    \"modelType\": \"sm.OLS\",\n",
    "    \"baseRelativePath\": \"..\",\n",
    "    \"inputs\": [\n",
    "        {\n",
    "            \"name\": \"const\",\n",
    "            \"type\": \"int\"\n",
    "        },\n",
    "        {\n",
    "            \"name\": \"Sqft\",\n",
    "            \"type\": \"float\"\n",
    "        }\n",
    "    ],\n",
    "    \"output\": {\n",
    "        \"name\": \"Rent\",\n",
    "        \"type\": \"float\"\n",
    "    }\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 1000
    },
    "id": "S-AyfiLN0Due",
    "outputId": "aacd248d-5a72-4ce0-ab0a-048f30d398ca"
   },
   "outputs": [],
   "source": [
    "# In-sample predictions from model 1, stored as a new column.\n",
    "predictedRent1 = rentSqftModel1Fit.predict(sm.add_constant(annArborDf[\"Sqft\"]))\n",
    "annArborDf['predictedRent1'] = predictedRent1\n",
    "annArborDf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "9ouX-mzz4sl-"
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 454
    },
    "id": "L55GN8hZ4wXi",
    "outputId": "712ace2c-5a04-48e0-acf0-cc42430f2aa9"
   },
   "outputs": [],
   "source": [
    "# Overlay the fitted line of rentSqftModel1Fit on the observed data.\n",
    "# Sqft is the x-axis and Rent the y-axis, so the points share the axes\n",
    "# of the regression line and of the axis labels set below.\n",
    "plt.scatter(\n",
    "    annArborDf[\"Sqft\"],\n",
    "    annArborDf[\"Rent\"],\n",
    "    color='blue',\n",
    "    alpha=0.5,\n",
    "    label='Data Points - scatter',\n",
    ")\n",
    "\n",
    "intercept = rentSqftModel1Fit.params['const']\n",
    "sqFtSlope = rentSqftModel1Fit.params['Sqft']\n",
    "# Evaluate the fitted line on a fixed Sqft grid (500 to 4500, 200 points).\n",
    "x_values = np.linspace(500, 4500, 200)\n",
    "y_values = intercept + sqFtSlope * x_values\n",
    "\n",
    "plt.plot(\n",
    "    x_values,\n",
    "    y_values,\n",
    "    color='red',\n",
    "    label='rentSqftModel1Fit - predictedRent1'\n",
    ")\n",
    "plt.xlabel('Sqft')\n",
    "plt.ylabel('Rent')\n",
    "plt.legend()\n",
    "plt.grid(True)\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "swSVnmy44Ddg",
    "outputId": "251afab3-0563-4eb7-e23a-b526238c7584"
   },
   "outputs": [],
   "source": [
    "# Model 2: Rent ~ const + Beds + Baths + Sqft.\n",
    "rentBedsBathsSqftModel = sm.OLS(\n",
    "    annArborDf[\"Rent\"],\n",
    "    sm.add_constant(annArborDf[[\"Beds\", \"Baths\", \"Sqft\"]])\n",
    ")\n",
    "rentBedsBathsSqftModelFit = rentBedsBathsSqftModel.fit()\n",
    "print(rentBedsBathsSqftModelFit.summary())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hand the fitted model 2 and its input/output description to the\n",
    "# project-local exportModel helper.\n",
    "from functions.exportModel import exportModel\n",
    "exportModel({\n",
    "    \"modelName\": \"rentBedsBathsSqftModelFit\",\n",
    "    \"model\": rentBedsBathsSqftModelFit,\n",
    "    \"description\": \"Predict Rent based on Beds,Baths,Sqft for annArborDf\",\n",
    "    \"modelType\": \"sm.OLS\",\n",
    "    \"baseRelativePath\": \"..\",\n",
    "    \"inputs\": [\n",
    "        {\n",
    "            \"name\": \"const\",\n",
    "            \"type\": \"int\"\n",
    "        },\n",
    "        {\n",
    "            \"name\": \"Beds\",\n",
    "            \"type\": \"int\"\n",
    "        },\n",
    "        {\n",
    "            \"name\": \"Baths\",\n",
    "            \"type\": \"int\"\n",
    "        },\n",
    "        {\n",
    "            \"name\": \"Sqft\",\n",
    "            \"type\": \"float\"\n",
    "        }\n",
    "    ],\n",
    "    \"output\": {\n",
    "        \"name\": \"Rent\",\n",
    "        \"type\": \"float\"\n",
    "    }\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "6lKEw7Wt57Px"
   },
   "outputs": [],
   "source": [
    "# math.log is used for the row-wise log(Rent) transform further down.\n",
    "import math"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 1000
    },
    "id": "da3o51IG5u7r",
    "outputId": "abe849ba-7689-468c-f327-b183c4d3f70a"
   },
   "outputs": [],
   "source": [
    "from functions.transformers import transformersDict\n",
    "# Build the log(Sqft) column with the project-local 'Sqft_log' transformer,\n",
    "# the same transformer name that is passed to exportModel below.\n",
    "# Presumably equivalent to the row-wise math.log in the commented line -\n",
    "# TODO confirm against functions.transformers.\n",
    "# annArborDf['log(Sqft)'] = annArborDf.apply(lambda row: math.log(row['Sqft']), axis=1)\n",
    "annArborDf['log(Sqft)'] = annArborDf.apply(transformersDict.get('Sqft_log'), axis=1)\n",
    "annArborDf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "lYYrtI0O5lSG",
    "outputId": "6a980e88-5630-4e5e-f887-875ab5f1d748"
   },
   "outputs": [],
   "source": [
    "# Model 3: Rent ~ const + Beds + Baths + log(Sqft).\n",
    "rentBedsBathsLogSqftModel = sm.OLS(\n",
    "    annArborDf[\"Rent\"],\n",
    "    sm.add_constant(annArborDf[[\"Beds\", \"Baths\", \"log(Sqft)\"]])\n",
    ")\n",
    "rentBedsBathsLogSqftModelFit = rentBedsBathsLogSqftModel.fit()\n",
    "print(rentBedsBathsLogSqftModelFit.summary())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export model 3. The declared input is the raw Sqft; the 'transformers'\n",
    "# entry names the 'Sqft_log' transformer for the log(Sqft) regressor.\n",
    "from functions.exportModel import exportModel\n",
    "exportModel({\n",
    "    \"modelName\": \"rentBedsBathsLogSqftModelFit\",\n",
    "    \"model\": rentBedsBathsLogSqftModelFit,\n",
    "    \"description\": \"Predict Rent based on Beds,Baths,log(Sqft) for annArborDf\",\n",
    "    \"modelType\": \"sm.OLS\",\n",
    "    \"baseRelativePath\": \"..\",\n",
    "    \"inputs\": [\n",
    "        {\n",
    "            \"name\": \"const\",\n",
    "            \"type\": \"int\"\n",
    "        },\n",
    "        {\n",
    "            \"name\": \"Beds\",\n",
    "            \"type\": \"int\"\n",
    "        },\n",
    "        {\n",
    "            \"name\": \"Baths\",\n",
    "            \"type\": \"int\"\n",
    "        },\n",
    "        {\n",
    "            \"name\": \"Sqft\",\n",
    "            \"type\": \"float\"\n",
    "        }\n",
    "    ],\n",
    "    \"transformers\":[\n",
    "        {\n",
    "            \"name\": \"log(Sqft)\",\n",
    "            \"transformer\": \"Sqft_log\"\n",
    "        }\n",
    "    ],\n",
    "    \"output\": {\n",
    "        \"name\": \"Rent\",\n",
    "        \"type\": \"float\"\n",
    "    }\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "amUWG6386dyn"
   },
   "outputs": [],
   "source": [
    "# Natural log of Rent, computed row by row; used as the response of the\n",
    "# log-rent models below.\n",
    "annArborDf['log(Rent)'] = annArborDf.apply(lambda row: math.log(row['Rent']), axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "LxcjPBLn6iAq",
    "outputId": "f827bc12-0083-4fb9-ea95-53a58cc0999b"
   },
   "outputs": [],
   "source": [
    "# Model 4: log(Rent) ~ const + Beds + Baths + Sqft (Sqft left untransformed).\n",
    "rentSqftModel4 = sm.OLS(\n",
    "    annArborDf[\"log(Rent)\"],\n",
    "    sm.add_constant(annArborDf[[\"Beds\", \"Baths\", \"Sqft\"]])\n",
    ")\n",
    "rentSqftModel4Fit = rentSqftModel4.fit()\n",
    "print(rentSqftModel4Fit.summary())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "WM5h3QnN60IY",
    "outputId": "56dd02c1-b8a8-4fcc-951f-676d574e6a62"
   },
   "outputs": [],
   "source": [
    "# Model 5: log(Rent) ~ const + Beds + Baths + log(Sqft).\n",
    "logRentBedsBathsLogSqftModel = sm.OLS(\n",
    "    annArborDf[\"log(Rent)\"],\n",
    "    sm.add_constant(annArborDf[[\"Beds\", \"Baths\", \"log(Sqft)\"]])\n",
    ")\n",
    "logRentBedsBathsLogSqftModelFit = logRentBedsBathsLogSqftModel.fit()\n",
    "print(logRentBedsBathsLogSqftModelFit.summary())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export model 5. Note that the declared output is log(Rent), not Rent.\n",
    "from functions.exportModel import exportModel\n",
    "exportModel({\n",
    "    \"modelName\": \"logRentBedsBathsLogSqftModelFit\",\n",
    "    \"model\": logRentBedsBathsLogSqftModelFit,\n",
    "    \"description\": \"Predict log(Rent) based on Beds,Baths,log(Sqft) for annArborDf\",\n",
    "    \"modelType\": \"sm.OLS\",\n",
    "    \"baseRelativePath\": \"..\",\n",
    "    \"inputs\": [\n",
    "        {\n",
    "            \"name\": \"const\",\n",
    "            \"type\": \"int\"\n",
    "        },\n",
    "        {\n",
    "            \"name\": \"Beds\",\n",
    "            \"type\": \"int\"\n",
    "        },\n",
    "        {\n",
    "            \"name\": \"Baths\",\n",
    "            \"type\": \"int\"\n",
    "        },\n",
    "        {\n",
    "            \"name\": \"Sqft\",\n",
    "            \"type\": \"float\"\n",
    "        }\n",
    "    ],\n",
    "    \"transformers\":[\n",
    "        {\n",
    "            \"name\": \"log(Sqft)\",\n",
    "            \"transformer\": \"Sqft_log\"\n",
    "        }\n",
    "    ],\n",
    "    \"output\": {\n",
    "        \"name\": \"log(Rent)\",\n",
    "        \"type\": \"float\"\n",
    "    }\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "1PHrUcM6694a",
    "outputId": "7b463d70-25d1-4073-bf7e-4e93f31c5fb2"
   },
   "outputs": [],
   "source": [
    "# Model 6: log(Rent) ~ const + Beds + log(Sqft) (Baths left out).\n",
    "rentSqftModel6 = sm.OLS(\n",
    "    annArborDf[\"log(Rent)\"],\n",
    "    sm.add_constant(annArborDf[[\"Beds\", \"log(Sqft)\"]])\n",
    ")\n",
    "rentSqftModel6Fit = rentSqftModel6.fit()\n",
    "print(rentSqftModel6Fit.summary())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 430
    },
    "id": "BybWTp_k7hzc",
    "outputId": "335b1499-534c-47d2-bdb6-7c0f3b456160"
   },
   "outputs": [],
   "source": [
    "# Rent against log(Sqft). The commented-out scatter calls are alternative\n",
    "# views on raw / log axes; when enabling one of them, update the axis\n",
    "# labels below to match the columns being plotted.\n",
    "\n",
    "# plt.scatter(\n",
    "#     annArborDf[\"Sqft\"],\n",
    "#     annArborDf[\"Rent\"],\n",
    "#     color='blue',\n",
    "#     alpha=0.9,\n",
    "#     label='Data Points - scatter',\n",
    "# )\n",
    "\n",
    "plt.scatter(\n",
    "    annArborDf[\"log(Sqft)\"],\n",
    "    annArborDf[\"Rent\"],\n",
    "    color='red',\n",
    "    alpha=0.9,\n",
    "    label='Data Points - scatter',\n",
    ")\n",
    "\n",
    "# plt.scatter(\n",
    "#     annArborDf[\"log(Sqft)\"],\n",
    "#     annArborDf[\"log(Rent)\"],\n",
    "#     color='Green',\n",
    "#     alpha=0.9,\n",
    "#     label='Data Points - scatter',\n",
    "# )\n",
    "\n",
    "# plt.scatter(\n",
    "#     annArborDf[\"Sqft\"],\n",
    "#     annArborDf[\"log(Rent)\"],\n",
    "#     color='Yellow',\n",
    "#     alpha=0.9,\n",
    "#     label='Data Points - scatter',\n",
    "# )\n",
    "\n",
    "# The active scatter puts log(Sqft) on the x-axis, so label it accordingly.\n",
    "plt.xlabel('log(Sqft)')\n",
    "plt.ylabel('Rent')\n",
    "plt.legend()\n",
    "plt.grid(True)\n",
    "\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}