model-registry/notebooks/wip/Advance_regression3.ipynb

702 lines
16 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "xwFyEsosINqT"
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "pKewSQysItJ-"
},
"outputs": [],
"source": [
"# https://www.statsmodels.org/stable/index.html\n",
"import statsmodels.api as sm"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Lz-DyAtNWsJR"
},
"outputs": [],
"source": [
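"# Dataset source: https://www.dropbox.com/scl/fi/bkcdp9tpqqh6dfr6phtt8/AnnArbor.xlsx?rlkey=0agfqwc7f0kt7oqb3e2h6q3qs&dl=1\n",
"# The next cell reads the workbook directly from this URL. To work offline instead,\n",
"# download the file, add it to Colab, and use the commented-out local read_excel call."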
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "0zM8FGMJXJ70"
},
"outputs": [],
"source": [
"# Load the Ann Arbor rentals workbook into a DataFrame.\n",
"# Local-file alternative, kept for offline use:\n",
"# annArborDf = pd.read_excel(\"./AnnArbor.xlsx\")\n",
"annArborXlsxUrl = \"https://www.dropbox.com/scl/fi/bkcdp9tpqqh6dfr6phtt8/AnnArbor.xlsx?rlkey=0agfqwc7f0kt7oqb3e2h6q3qs&dl=1\"\n",
"annArborDf = pd.read_excel(annArborXlsxUrl)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"id": "t0LUca0Myqw5",
"outputId": "249ab087-895f-4fa6-993e-e8dd50ef87c1"
},
"outputs": [],
"source": [
"annArborDf"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "GQRNPIeyy6ub",
"outputId": "00211933-f2b1-40c6-d9cf-187560ffa305"
},
"outputs": [],
"source": [
"annArborDf.size"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "yumMybniy85d"
},
"outputs": [],
"source": [
"annArborDf.describe()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "aspq6hoPy_xZ",
"outputId": "96892272-a1d5-400e-a177-6c96746619d8"
},
"outputs": [],
"source": [
"annArborDf.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "z_hVTvPrzYJr"
},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"id": "pIniVuaIzaaZ",
"outputId": "6a061f6a-8bff-42c0-d705-0c2bd06eb5ff"
},
"outputs": [],
"source": [
"# Plotting: create an 8x8 figure and keep a handle to it in fig1.\n",
"# NOTE(review): the plots below are drawn by plt.scatter calls in later cells; with an\n",
"# inline backend this figure may be closed when this cell ends, in which case the\n",
"# 8x8 size would not apply to them -- confirm this is the intended behaviour.\n",
"fig1 = plt.figure(figsize=(8, 8))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 449
},
"id": "VHdpDE7o42Pf",
"outputId": "ac876802-b6d1-4926-d069-0532ee9e7a0b"
},
"outputs": [],
"source": [
"# Scatter of Rent (y) against number of Beds (x).\n",
"plt.scatter(\n",
"    x=annArborDf[\"Beds\"],\n",
"    y=annArborDf[\"Rent\"],\n",
"    color='blue',\n",
"    alpha=0.9,\n",
"    label='Data Points - scatter',\n",
")\n",
"plt.grid(True)\n",
"plt.xlabel('Beds')\n",
"plt.ylabel('Rent')\n",
"plt.legend()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 449
},
"id": "knAa4W9R47rZ",
"outputId": "81359d91-03b7-4f70-c381-c88172f800a9"
},
"outputs": [],
"source": [
"# Scatter of Rent (y) against number of Baths (x).\n",
"plt.scatter(\n",
"    x=annArborDf[\"Baths\"],\n",
"    y=annArborDf[\"Rent\"],\n",
"    color='blue',\n",
"    alpha=0.9,\n",
"    label='Data Points - scatter',\n",
")\n",
"plt.grid(True)\n",
"plt.xlabel('Baths')\n",
"plt.ylabel('Rent')\n",
"plt.legend()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 449
},
"id": "dOnWJbFOzczV",
"outputId": "c6d6b86b-dd85-45d1-b543-928441c11dc4"
},
"outputs": [],
"source": [
"# Scatter of Rent (y) against Sqft (x).\n",
"plt.scatter(\n",
"    x=annArborDf[\"Sqft\"],\n",
"    y=annArborDf[\"Rent\"],\n",
"    color='blue',\n",
"    alpha=0.9,\n",
"    label='Data Points - scatter',\n",
")\n",
"plt.grid(True)\n",
"plt.xlabel('Sqft')\n",
"plt.ylabel('Rent')\n",
"plt.legend()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "alIhUPPUzvli",
"outputId": "8ed14c4b-a596-49ac-912a-0dcb4145df89"
},
"outputs": [],
"source": [
"# Ordinary least squares: Rent regressed on Sqft plus an intercept column.\n",
"rentSqftModel1 = sm.OLS(\n",
"    endog=annArborDf[\"Rent\"],\n",
"    exog=sm.add_constant(annArborDf[[\"Sqft\"]]),\n",
")\n",
"rentSqftModel1Fit = rentSqftModel1.fit()\n",
"\n",
"# Print the text summary of the fitted model.\n",
"print(rentSqftModel1Fit.summary())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from functions.exportModel import exportModel\n",
"\n",
"# Hand the fitted Rent ~ Sqft model to exportModel together with its metadata:\n",
"# the input columns (including the intercept column 'const') and the predicted output.\n",
"exportModel({\n",
"    \"modelName\": \"rentSqftModel1Fit\",\n",
"    \"model\": rentSqftModel1Fit,\n",
"    \"description\": \"Predict Rent based on Sqft for annArborDf\",\n",
"    \"modelType\": \"sm.OLS\",\n",
"    \"baseRelativePath\": \"..\",\n",
"    \"inputs\": [\n",
"        {\"name\": \"const\", \"type\": \"int\"},\n",
"        {\"name\": \"Sqft\", \"type\": \"float\"},\n",
"    ],\n",
"    \"output\": {\"name\": \"Rent\", \"type\": \"float\"},\n",
"})"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"id": "S-AyfiLN0Due",
"outputId": "aacd248d-5a72-4ce0-ab0a-048f30d398ca"
},
"outputs": [],
"source": [
"# Predictions of the Rent ~ Sqft model for every row, stored as a new column.\n",
"predictedRent1 = rentSqftModel1Fit.predict(\n",
"    exog=sm.add_constant(annArborDf[\"Sqft\"])\n",
")\n",
"annArborDf['predictedRent1'] = predictedRent1\n",
"\n",
"# Display the frame including the new column.\n",
"annArborDf"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "9ouX-mzz4sl-"
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 454
},
"id": "L55GN8hZ4wXi",
"outputId": "712ace2c-5a04-48e0-acf0-cc42430f2aa9"
},
"outputs": [],
"source": [
"# Overlay the fitted line of rentSqftModel1Fit on the observed data.\n",
"# Sqft is on the x-axis and Rent on the y-axis so that the scatter points, the\n",
"# fitted line and the axis labels all share the same orientation.\n",
"plt.scatter(\n",
"    annArborDf[\"Sqft\"],\n",
"    annArborDf[\"Rent\"],\n",
"    color='blue',\n",
"    alpha=0.5,\n",
"    label='Data Points - scatter',\n",
")\n",
"\n",
"# Fitted line: Rent = intercept + slope * Sqft, drawn over a fixed Sqft range.\n",
"intercept = rentSqftModel1Fit.params['const']\n",
"sqFtSlope = rentSqftModel1Fit.params['Sqft']\n",
"x_values = np.linspace(500, 4500, 200)\n",
"y_values = intercept + sqFtSlope * x_values\n",
"\n",
"plt.plot(\n",
"    x_values,\n",
"    y_values,\n",
"    color='red',\n",
"    label='rentSqftModel1Fit - predictedRent1'\n",
")\n",
"plt.xlabel('Sqft')\n",
"plt.ylabel('Rent')\n",
"plt.legend()\n",
"plt.grid(True)\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "swSVnmy44Ddg",
"outputId": "251afab3-0563-4eb7-e23a-b526238c7584"
},
"outputs": [],
"source": [
"# Ordinary least squares: Rent regressed on Beds, Baths and Sqft plus an intercept column.\n",
"rentBedsBathsSqftModel = sm.OLS(\n",
"    endog=annArborDf[\"Rent\"],\n",
"    exog=sm.add_constant(annArborDf[[\"Beds\", \"Baths\", \"Sqft\"]]),\n",
")\n",
"rentBedsBathsSqftModelFit = rentBedsBathsSqftModel.fit()\n",
"\n",
"# Print the text summary of the fitted model.\n",
"print(rentBedsBathsSqftModelFit.summary())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from functions.exportModel import exportModel\n",
"\n",
"# Hand the fitted Rent ~ Beds + Baths + Sqft model to exportModel with its metadata.\n",
"# NOTE(review): Baths is declared as int -- confirm the column holds whole numbers only.\n",
"exportModel({\n",
"    \"modelName\": \"rentBedsBathsSqftModelFit\",\n",
"    \"model\": rentBedsBathsSqftModelFit,\n",
"    \"description\": \"Predict Rent based on Beds,Baths,Sqft for annArborDf\",\n",
"    \"modelType\": \"sm.OLS\",\n",
"    \"baseRelativePath\": \"..\",\n",
"    \"inputs\": [\n",
"        {\"name\": \"const\", \"type\": \"int\"},\n",
"        {\"name\": \"Beds\", \"type\": \"int\"},\n",
"        {\"name\": \"Baths\", \"type\": \"int\"},\n",
"        {\"name\": \"Sqft\", \"type\": \"float\"},\n",
"    ],\n",
"    \"output\": {\"name\": \"Rent\", \"type\": \"float\"},\n",
"})"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "6lKEw7Wt57Px"
},
"outputs": [],
"source": [
"import math"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"id": "da3o51IG5u7r",
"outputId": "abe849ba-7689-468c-f327-b183c4d3f70a"
},
"outputs": [],
"source": [
"from functions.transformers import transformersDict\n",
"\n",
"# Add a log(Sqft) column by applying the project's 'Sqft_log' transformer to each row.\n",
"# Earlier inline version, presumably equivalent (confirm against functions.transformers):\n",
"# annArborDf['log(Sqft)'] = annArborDf.apply(lambda row: math.log(row['Sqft']), axis=1)\n",
"annArborDf['log(Sqft)'] = annArborDf.apply(\n",
"    func=transformersDict.get('Sqft_log'),\n",
"    axis=1,\n",
")\n",
"\n",
"# Display the frame including the new column.\n",
"annArborDf"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "lYYrtI0O5lSG",
"outputId": "6a980e88-5630-4e5e-f887-875ab5f1d748"
},
"outputs": [],
"source": [
"# Ordinary least squares: Rent regressed on Beds, Baths and log(Sqft) plus an intercept column.\n",
"rentBedsBathsLogSqftModel = sm.OLS(\n",
"    endog=annArborDf[\"Rent\"],\n",
"    exog=sm.add_constant(annArborDf[[\"Beds\", \"Baths\", \"log(Sqft)\"]]),\n",
")\n",
"rentBedsBathsLogSqftModelFit = rentBedsBathsLogSqftModel.fit()\n",
"\n",
"# Print the text summary of the fitted model.\n",
"print(rentBedsBathsLogSqftModelFit.summary())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from functions.exportModel import exportModel\n",
"\n",
"# Hand the fitted Rent ~ Beds + Baths + log(Sqft) model to exportModel with its metadata.\n",
"# 'inputs' lists the raw Sqft column; the 'transformers' entry names the 'Sqft_log'\n",
"# transformer for the derived 'log(Sqft)' column the model was fitted on.\n",
"# NOTE(review): Baths is declared as int -- confirm the column holds whole numbers only.\n",
"exportModel({\n",
"    \"modelName\": \"rentBedsBathsLogSqftModelFit\",\n",
"    \"model\": rentBedsBathsLogSqftModelFit,\n",
"    \"description\": \"Predict Rent based on Beds,Baths,log(Sqft) for annArborDf\",\n",
"    \"modelType\": \"sm.OLS\",\n",
"    \"baseRelativePath\": \"..\",\n",
"    \"inputs\": [\n",
"        {\"name\": \"const\", \"type\": \"int\"},\n",
"        {\"name\": \"Beds\", \"type\": \"int\"},\n",
"        {\"name\": \"Baths\", \"type\": \"int\"},\n",
"        {\"name\": \"Sqft\", \"type\": \"float\"},\n",
"    ],\n",
"    \"transformers\": [\n",
"        {\"name\": \"log(Sqft)\", \"transformer\": \"Sqft_log\"},\n",
"    ],\n",
"    \"output\": {\"name\": \"Rent\", \"type\": \"float\"},\n",
"})"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "amUWG6386dyn"
},
"outputs": [],
"source": [
"# Natural log of Rent, computed on the whole column at once instead of a row-wise\n",
"# apply with math.log. For non-positive rents np.log yields nan/-inf with a warning\n",
"# where math.log would raise ValueError.\n",
"annArborDf['log(Rent)'] = np.log(annArborDf['Rent'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "LxcjPBLn6iAq",
"outputId": "f827bc12-0083-4fb9-ea95-53a58cc0999b"
},
"outputs": [],
"source": [
"# Ordinary least squares: log(Rent) regressed on Beds, Baths and Sqft plus an intercept column.\n",
"rentSqftModel4 = sm.OLS(\n",
"    endog=annArborDf[\"log(Rent)\"],\n",
"    exog=sm.add_constant(annArborDf[[\"Beds\", \"Baths\", \"Sqft\"]]),\n",
")\n",
"rentSqftModel4Fit = rentSqftModel4.fit()\n",
"\n",
"# Print the text summary of the fitted model.\n",
"print(rentSqftModel4Fit.summary())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "WM5h3QnN60IY",
"outputId": "56dd02c1-b8a8-4fcc-951f-676d574e6a62"
},
"outputs": [],
"source": [
"# Ordinary least squares: log(Rent) regressed on Beds, Baths and log(Sqft) plus an intercept column.\n",
"logRentBedsBathsLogSqftModel = sm.OLS(\n",
"    endog=annArborDf[\"log(Rent)\"],\n",
"    exog=sm.add_constant(annArborDf[[\"Beds\", \"Baths\", \"log(Sqft)\"]]),\n",
")\n",
"logRentBedsBathsLogSqftModelFit = logRentBedsBathsLogSqftModel.fit()\n",
"\n",
"# Print the text summary of the fitted model.\n",
"print(logRentBedsBathsLogSqftModelFit.summary())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from functions.exportModel import exportModel\n",
"\n",
"# Hand the fitted log(Rent) ~ Beds + Baths + log(Sqft) model to exportModel with its metadata.\n",
"# 'inputs' lists the raw Sqft column; the 'transformers' entry names the 'Sqft_log'\n",
"# transformer for the derived 'log(Sqft)' column. The declared output is log(Rent), not Rent.\n",
"# NOTE(review): Baths is declared as int -- confirm the column holds whole numbers only.\n",
"exportModel({\n",
"    \"modelName\": \"logRentBedsBathsLogSqftModelFit\",\n",
"    \"model\": logRentBedsBathsLogSqftModelFit,\n",
"    \"description\": \"Predict log(Rent) based on Beds,Baths,log(Sqft) for annArborDf\",\n",
"    \"modelType\": \"sm.OLS\",\n",
"    \"baseRelativePath\": \"..\",\n",
"    \"inputs\": [\n",
"        {\"name\": \"const\", \"type\": \"int\"},\n",
"        {\"name\": \"Beds\", \"type\": \"int\"},\n",
"        {\"name\": \"Baths\", \"type\": \"int\"},\n",
"        {\"name\": \"Sqft\", \"type\": \"float\"},\n",
"    ],\n",
"    \"transformers\": [\n",
"        {\"name\": \"log(Sqft)\", \"transformer\": \"Sqft_log\"},\n",
"    ],\n",
"    \"output\": {\"name\": \"log(Rent)\", \"type\": \"float\"},\n",
"})"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "1PHrUcM6694a",
"outputId": "7b463d70-25d1-4073-bf7e-4e93f31c5fb2"
},
"outputs": [],
"source": [
"# Ordinary least squares: log(Rent) regressed on Beds and log(Sqft) plus an intercept column.\n",
"rentSqftModel6 = sm.OLS(\n",
"    endog=annArborDf[\"log(Rent)\"],\n",
"    exog=sm.add_constant(annArborDf[[\"Beds\", \"log(Sqft)\"]]),\n",
")\n",
"rentSqftModel6Fit = rentSqftModel6.fit()\n",
"\n",
"# Print the text summary of the fitted model.\n",
"print(rentSqftModel6Fit.summary())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 430
},
"id": "BybWTp_k7hzc",
"outputId": "335b1499-534c-47d2-bdb6-7c0f3b456160"
},
"outputs": [],
"source": [
"# Scatter of Rent (y) against log(Sqft) (x).\n",
"# Other pairings explored in this cell earlier: Sqft vs Rent, log(Sqft) vs log(Rent),\n",
"# and Sqft vs log(Rent); swap the two columns below to reproduce them.\n",
"plt.scatter(\n",
"    annArborDf[\"log(Sqft)\"],\n",
"    annArborDf[\"Rent\"],\n",
"    color='red',\n",
"    alpha=0.9,\n",
"    label='Data Points - scatter',\n",
")\n",
"\n",
"# Label the x-axis with the column actually plotted; it was previously left unlabelled.\n",
"plt.xlabel('log(Sqft)')\n",
"plt.ylabel('Rent')\n",
"plt.legend()\n",
"plt.grid(True)\n",
"\n",
"plt.show()"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}