In [1]:
import numpy as np
import pandas as pd

In [2]:
# https://www.statsmodels.org/stable/index.html
import statsmodels.api as sm

In [3]:
# Dataset: https://www.dropbox.com/scl/fi/zt2vtwhpz8ndblsxqdqx1/Salary_MIS.xlsx?rlkey=2uk6m7m9w90isv6zsynhhhpyv&st=gxumjns5&dl=1
# The next cell reads it directly from this URL. Alternatively, download it and add it to Colab to load it from a local path.

In [4]:
# Load the salary workbook straight from its share link into a DataFrame.
salaryWorkbookUrl = "https://www.dropbox.com/scl/fi/zt2vtwhpz8ndblsxqdqx1/Salary_MIS.xlsx?rlkey=2uk6m7m9w90isv6zsynhhhpyv&st=gxumjns5&dl=1"
sallaryMisDf = pd.read_excel(io=salaryWorkbookUrl)

In [5]:
# Local alternative, once the workbook has been downloaded next to this notebook:
# sallaryMisDf = pd.read_excel("./Salary_MIS.xlsx")

In [6]:
# Preview the loaded frame (columns: Salary, GPA, MIS, Statistics).
sallaryMisDf

Unnamed: 0,Salary,GPA,MIS,Statistics
0,72,3.53,1,0
1,66,2.86,1,0
2,72,3.69,0,0
3,63,3.24,0,0
4,65,3.21,0,0
...,...,...,...,...
115,66,3.27,0,0
116,63,2.86,1,0
117,78,3.04,1,1
118,64,2.99,0,0


In [7]:
# Dimensions of the loaded data as (rows, columns).
sallaryMisDf.shape

(120, 4)

In [8]:
# Summary statistics per column; MIS and Statistics range over 0..1 (indicator columns).
sallaryMisDf.describe()

Unnamed: 0,Salary,GPA,MIS,Statistics
count,120.0,120.0,120.0,120.0
mean,69.875,3.24275,0.316667,0.341667
std,6.594577,0.493834,0.467127,0.476257
min,53.0,2.41,0.0,0.0
25%,65.75,2.805,0.0,0.0
50%,70.0,3.28,0.0,0.0
75%,73.25,3.6925,1.0,1.0
max,88.0,3.98,1.0,1.0


In [9]:
# Re-check the dimensions before modelling (nothing above has modified the frame).
sallaryMisDf.shape

(120, 4)

In [10]:
# Preview the design matrix: the three predictors plus a "const" column of 1.0
# added by add_constant. The result is only displayed here, not stored.
sm.add_constant(sallaryMisDf[["GPA", "MIS", "Statistics"]])

Unnamed: 0,const,GPA,MIS,Statistics
0,1.0,3.53,1,0
1,1.0,2.86,1,0
2,1.0,3.69,0,0
3,1.0,3.24,0,0
4,1.0,3.21,0,0
...,...,...,...,...
115,1.0,3.27,0,0
116,1.0,2.86,1,0
117,1.0,3.04,1,1
118,1.0,2.99,0,0


In [11]:
# Baseline specification: Salary ~ const + GPA + MIS + Statistics (not fitted yet).
salaryBasedOnGpaMisStatistics = sm.OLS(
    endog=sallaryMisDf["Salary"],
    exog=sm.add_constant(sallaryMisDf[["GPA", "MIS", "Statistics"]]),
)

In [12]:
# Estimate the baseline model; the returned results object is reused for export and summary.
salaryBasedOnGpaMisStatisticsFit = salaryBasedOnGpaMisStatistics.fit()

In [13]:
from functions.exportModel import exportModel

# Export the fitted baseline model along with its metadata (name, description,
# expected inputs, predicted output). exportModel is a project helper not shown
# in this notebook; per the cell output it returns the fitted results object.
exportModel({
    "modelName": "salaryBasedOnGpaMisStatisticsFit",
    "model": salaryBasedOnGpaMisStatisticsFit,
    "description": "Predict Salary based on GPA MIS Statistics for sallaryMisDf",
    "modelType": "sm.OLS",
    "baseRelativePath": "..",
    # Inputs are listed in the same order as the design-matrix columns.
    "inputs": [
        {"name": "const", "type": "int"},
        {"name": "GPA", "type": "float"},
        {"name": "MIS", "type": "binary"},
        {"name": "Statistics", "type": "binary"},
    ],
    "output": {"name": "Salary", "type": "int"},
})

<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x3473b7020>

In [14]:
# Plain-text regression report for the baseline model (fit statistics and coefficient table).
print(salaryBasedOnGpaMisStatisticsFit.summary())

                            OLS Regression Results                            
Dep. Variable:                 Salary   R-squared:                       0.795
Model:                            OLS   Adj. R-squared:                  0.790
Method:                 Least Squares   F-statistic:                     150.3
Date:                Sun, 09 Jun 2024   Prob (F-statistic):           8.35e-40
Time:                        01:24:53   Log-Likelihood:                -300.92
No. Observations:                 120   AIC:                             609.8
Df Residuals:                     116   BIC:                             621.0
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         44.0072      1.860     23.662      0.0

In [15]:
# Show the frame again before the interaction columns are added below.
sallaryMisDf

Unnamed: 0,Salary,GPA,MIS,Statistics
0,72,3.53,1,0
1,66,2.86,1,0
2,72,3.69,0,0
3,63,3.24,0,0
4,65,3.21,0,0
...,...,...,...,...
115,66,3.27,0,0
116,63,2.86,1,0
117,78,3.04,1,1
118,64,2.99,0,0


In [16]:
from functions.transformers import transformersDict

# MIS x Statistics interaction, computed directly as an element-wise product.
sallaryMisDf["misXStatistics"] = sallaryMisDf["MIS"].mul(sallaryMisDf["Statistics"])

# The same interaction, built row by row with the transformer looked up in transformersDict.
# Inline equivalent, kept for reference: lambda row: row['MIS'] * row['Statistics']
misXStatisticsTransformer = transformersDict.get('MIS_X_Statistics')
sallaryMisDf['misXStatistics1'] = sallaryMisDf.apply(misXStatisticsTransformer, axis=1)

# Display both columns side by side for comparison.
sallaryMisDf

Unnamed: 0,Salary,GPA,MIS,Statistics,misXStatistics,misXStatistics1
0,72,3.53,1,0,0,0.0
1,66,2.86,1,0,0,0.0
2,72,3.69,0,0,0,0.0
3,63,3.24,0,0,0,0.0
4,65,3.21,0,0,0,0.0
...,...,...,...,...,...,...
115,66,3.27,0,0,0,0.0
116,63,2.86,1,0,0,0.0
117,78,3.04,1,1,1,1.0
118,64,2.99,0,0,0,0.0


In [17]:
# Specification: Salary ~ const + GPA + MIS + Statistics + MIS*Statistics.
# The interaction regressor is the "misXStatistics" column, so the fitted parameter
# name agrees with the transformer output name declared when this model is exported
# and with the column used by the all-interactions model further down.
# "misXStatistics1" is expected to hold the same 0/1 values as floats (it does in
# the previewed rows), so the estimates should be unchanged and only the label differs.
# NOTE(review): confirm that exportModel consumers match transformer "name" to the
# fitted column name.
salaryBasedOnGpaMisStatistics_Transfoms_misXStatistics = sm.OLS(
  sallaryMisDf["Salary"],
  sm.add_constant(
      sallaryMisDf[[
          "GPA",
          "MIS",
          "Statistics",
          "misXStatistics"
      ]]
  )
)

In [18]:
# Estimate the model that includes the MIS x Statistics interaction.
salaryBasedOnGpaMisStatistics_Transfoms_misXStatisticsFit = salaryBasedOnGpaMisStatistics_Transfoms_misXStatistics.fit()

In [19]:
from functions.exportModel import exportModel

# Export the MIS x Statistics interaction model. Only the raw inputs are listed under
# "inputs"; the interaction column is described under "transformers" by the key of
# the transformer used to build it.
exportModel({
    "modelName": "salaryBasedOnGpaMisStatistics_Transfoms_misXStatisticsFit",
    "model": salaryBasedOnGpaMisStatistics_Transfoms_misXStatisticsFit,
    "description": "Predict Salary based on GPA MIS Statistics and interaction MIS * Statistics for sallaryMisDf",
    "modelType": "sm.OLS",
    "baseRelativePath": "..",
    "inputs": [
        {"name": "const", "type": "int"},
        {"name": "GPA", "type": "float"},
        {"name": "MIS", "type": "binary"},
        {"name": "Statistics", "type": "binary"},
    ],
    "transformers": [
        {"name": "misXStatistics", "transformer": "MIS_X_Statistics"},
    ],
    "output": {"name": "Salary", "type": "int"},
})

<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x3473d1e20>

In [20]:
# Plain-text regression report for the MIS x Statistics interaction model.
print(salaryBasedOnGpaMisStatistics_Transfoms_misXStatisticsFit.summary())

                            OLS Regression Results                            
Dep. Variable:                 Salary   R-squared:                       0.810
Model:                            OLS   Adj. R-squared:                  0.803
Method:                 Least Squares   F-statistic:                     122.2
Date:                Sun, 09 Jun 2024   Prob (F-statistic):           1.87e-40
Time:                        01:24:53   Log-Likelihood:                -296.63
No. Observations:                 120   AIC:                             603.3
Df Residuals:                     115   BIC:                             617.2
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const              44.0993      1.803     

In [21]:
# MIS x GPA interaction, built row by row with the transformer looked up in transformersDict.
# Inline equivalent, kept for reference: lambda row: row['MIS'] * row['GPA']
misXGpaTransformer = transformersDict.get('MIS_X_GPA')
sallaryMisDf['misXGpa'] = sallaryMisDf.apply(misXGpaTransformer, axis=1)

# Display the frame with the new column.
sallaryMisDf

Unnamed: 0,Salary,GPA,MIS,Statistics,misXStatistics,misXStatistics1,misXGpa
0,72,3.53,1,0,0,0.0,3.53
1,66,2.86,1,0,0,0.0,2.86
2,72,3.69,0,0,0,0.0,0.00
3,63,3.24,0,0,0,0.0,0.00
4,65,3.21,0,0,0,0.0,0.00
...,...,...,...,...,...,...,...
115,66,3.27,0,0,0,0.0,0.00
116,63,2.86,1,0,0,0.0,2.86
117,78,3.04,1,1,1,1.0,3.04
118,64,2.99,0,0,0,0.0,0.00


In [22]:
# Specification: Salary ~ const + GPA + MIS + Statistics + MIS*GPA (not fitted yet).
salaryBasedOnGpaMisStatistics_Transfoms_misXGpa = sm.OLS(
    endog=sallaryMisDf["Salary"],
    exog=sm.add_constant(
        sallaryMisDf[["GPA", "MIS", "Statistics", "misXGpa"]]
    ),
)

In [23]:
# Estimate the model that includes the MIS x GPA interaction.
salaryBasedOnGpaMisStatistics_Transfoms_misXGpaFit = salaryBasedOnGpaMisStatistics_Transfoms_misXGpa.fit()

In [24]:
from functions.exportModel import exportModel

# Export the MIS x GPA interaction model. Raw inputs are listed under "inputs";
# the derived interaction column is described under "transformers".
exportModel({
    "modelName": "salaryBasedOnGpaMisStatistics_Transfoms_misXGpaFit",
    "model": salaryBasedOnGpaMisStatistics_Transfoms_misXGpaFit,
    "description": "Predict Salary based on GPA MIS Statistics and interaction misXGpa for sallaryMisDf",
    "modelType": "sm.OLS",
    "baseRelativePath": "..",
    "inputs": [
        {"name": "const", "type": "int"},
        {"name": "GPA", "type": "float"},
        {"name": "MIS", "type": "binary"},
        {"name": "Statistics", "type": "binary"},
    ],
    "transformers": [
        {"name": "misXGpa", "transformer": "MIS_X_GPA"},
    ],
    "output": {"name": "Salary", "type": "int"},
})

<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x3473f2a20>

In [25]:
# Plain-text regression report for the MIS x GPA interaction model.
print(salaryBasedOnGpaMisStatistics_Transfoms_misXGpaFit.summary())

                            OLS Regression Results                            
Dep. Variable:                 Salary   R-squared:                       0.795
Model:                            OLS   Adj. R-squared:                  0.788
Method:                 Least Squares   F-statistic:                     111.8
Date:                Sun, 09 Jun 2024   Prob (F-statistic):           1.11e-38
Time:                        01:24:53   Log-Likelihood:                -300.91
No. Observations:                 120   AIC:                             611.8
Df Residuals:                     115   BIC:                             625.8
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         44.1653      2.307     19.142      0.0

In [26]:
# Statistics x GPA interaction, built row by row with the transformer looked up in transformersDict.
# Inline equivalent, kept for reference: lambda row: row['Statistics'] * row['GPA']
statisticsXGpaTransformer = transformersDict.get('GPA_X_Statistics')
sallaryMisDf['statisticsXGpa'] = sallaryMisDf.apply(statisticsXGpaTransformer, axis=1)

# Display the frame with the new column.
sallaryMisDf

Unnamed: 0,Salary,GPA,MIS,Statistics,misXStatistics,misXStatistics1,misXGpa,statisticsXGpa
0,72,3.53,1,0,0,0.0,3.53,0.00
1,66,2.86,1,0,0,0.0,2.86,0.00
2,72,3.69,0,0,0,0.0,0.00,0.00
3,63,3.24,0,0,0,0.0,0.00,0.00
4,65,3.21,0,0,0,0.0,0.00,0.00
...,...,...,...,...,...,...,...,...
115,66,3.27,0,0,0,0.0,0.00,0.00
116,63,2.86,1,0,0,0.0,2.86,0.00
117,78,3.04,1,1,1,1.0,3.04,3.04
118,64,2.99,0,0,0,0.0,0.00,0.00


In [27]:
# Specification: Salary ~ const + GPA + MIS + Statistics + Statistics*GPA (not fitted yet).
salaryBasedOnGpaMisStatistics_Transfoms_statisticsXGpa = sm.OLS(
    endog=sallaryMisDf["Salary"],
    exog=sm.add_constant(
        sallaryMisDf[["GPA", "MIS", "Statistics", "statisticsXGpa"]]
    ),
)

In [28]:
# Estimate the model that includes the Statistics x GPA interaction.
salaryBasedOnGpaMisStatistics_Transfoms_statisticsXGpaFit = salaryBasedOnGpaMisStatistics_Transfoms_statisticsXGpa.fit()

In [29]:
from functions.exportModel import exportModel

# Export the Statistics x GPA interaction model. Raw inputs are listed under "inputs";
# the derived interaction column is described under "transformers".
# The description previously read "interaction misXGpa for statisticsXGpa" (a copy-paste
# slip); it now names the interaction this model actually uses and the source frame,
# in the same form as the descriptions of the other exported models.
exportModel({
    "modelName": "salaryBasedOnGpaMisStatistics_Transfoms_statisticsXGpaFit",
    "model": salaryBasedOnGpaMisStatistics_Transfoms_statisticsXGpaFit,
    "description": "Predict Salary based on GPA MIS Statistics and interaction statisticsXGpa for sallaryMisDf",
    "modelType": "sm.OLS",
    "baseRelativePath": "..",
    "inputs": [
        {
            "name": "const",
            "type": "int"
        },
        {
            "name": "GPA",
            "type": "float"
        },
        {
            "name": "MIS",
            "type": "binary"
        },
        {
            "name": "Statistics",
            "type": "binary"
        }
    ],
    "transformers":[
        {
            "name": "statisticsXGpa",
            "transformer": "GPA_X_Statistics"
        }
    ],
    "output": {
        "name": "Salary",
        "type": "int"
    }
})

<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x3473f1040>

In [30]:
# Plain-text regression report for the Statistics x GPA interaction model.
print(salaryBasedOnGpaMisStatistics_Transfoms_statisticsXGpaFit.summary())

                            OLS Regression Results                            
Dep. Variable:                 Salary   R-squared:                       0.803
Model:                            OLS   Adj. R-squared:                  0.796
Method:                 Least Squares   F-statistic:                     116.9
Date:                Sun, 09 Jun 2024   Prob (F-statistic):           1.44e-39
Time:                        01:24:53   Log-Likelihood:                -298.78
No. Observations:                 120   AIC:                             607.6
Df Residuals:                     115   BIC:                             621.5
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const             41.2856      2.267     18.

In [31]:
# Full specification: the three main effects plus all three pairwise interactions
# (misXStatistics, misXGpa, statisticsXGpa) and a constant. Not fitted yet.
salaryBasedOnGpaMisStatistics_Transfoms_misXStatistics_misXGpa_statisticsXGpa = sm.OLS(
    endog=sallaryMisDf["Salary"],
    exog=sm.add_constant(
        sallaryMisDf[[
            "GPA", "MIS", "Statistics",
            "misXStatistics", "misXGpa", "statisticsXGpa",
        ]]
    ),
)

In [32]:
# Estimate the model that includes all three pairwise interactions.
salaryBasedOnGpaMisStatistics_Transfoms_misXStatistics_misXGpa_statisticsXGpaFit = salaryBasedOnGpaMisStatistics_Transfoms_misXStatistics_misXGpa_statisticsXGpa.fit()

In [33]:
from functions.exportModel import exportModel

# Export the all-interactions model. Raw inputs are listed under "inputs"; each
# derived interaction column is described under "transformers", in the same order
# as the interaction columns appear in the design matrix.
exportModel({
    "modelName": "salaryBasedOnGpaMisStatistics_Transfoms_misXStatistics_misXGpa_statisticsXGpaFit",
    "model": salaryBasedOnGpaMisStatistics_Transfoms_misXStatistics_misXGpa_statisticsXGpaFit,
    "description": "Predict Salary based on GPA MIS Statistics and interaction misXStatistics, misXGpa, statisticsXGpa",
    "modelType": "sm.OLS",
    "baseRelativePath": "..",
    "inputs": [
        {"name": "const", "type": "int"},
        {"name": "GPA", "type": "float"},
        {"name": "MIS", "type": "binary"},
        {"name": "Statistics", "type": "binary"},
    ],
    "transformers": [
        {"name": "misXStatistics", "transformer": "MIS_X_Statistics"},
        {"name": "misXGpa", "transformer": "MIS_X_GPA"},
        {"name": "statisticsXGpa", "transformer": "GPA_X_Statistics"},
    ],
    "output": {"name": "Salary", "type": "int"},
})

<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x34741ec30>

In [34]:
# Plain-text regression report for the all-interactions model.
print(salaryBasedOnGpaMisStatistics_Transfoms_misXStatistics_misXGpa_statisticsXGpaFit.summary())

                            OLS Regression Results                            
Dep. Variable:                 Salary   R-squared:                       0.815
Model:                            OLS   Adj. R-squared:                  0.805
Method:                 Least Squares   F-statistic:                     83.09
Date:                Sun, 09 Jun 2024   Prob (F-statistic):           4.15e-39
Time:                        01:24:53   Log-Likelihood:                -294.81
No. Observations:                 120   AIC:                             603.6
Df Residuals:                     113   BIC:                             623.1
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const             41.7092      2.481     16.