#!/usr/bin/env python # coding: utf-8 # In[97]: import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns import statsmodels.api as sm from statsmodels.formula.api import ols # In[235]: df = pd.read_csv("apartment_for_rent_train.csv") # In[236]: df.info() # ### Missing Values and Outlier Removal # In[237]: df.isna().sum() # In[238]: df.dropna(subset=['Monthly_EUR_Price'], inplace=True) # In[239]: sns.boxplot(data=df, y = 'Monthly_EUR_Price') # In[246]: df.Monthly_EUR_Price.describe() # In[247]: upper = np.percentile(df.Monthly_EUR_Price, 75)+1.5*(np.percentile(df.Monthly_EUR_Price, 75)-np.percentile(df.Monthly_EUR_Price, 25)) upper #upper outlier threshold # In[248]: df1 = df[df['Monthly_EUR_Price'] 6: df_cat1.drop(var,axis=1,inplace=True) df_cat1.info() # In[256]: catcols1 = df_cat1.columns plt.figure(figsize=(12, 18)) for i, var in enumerate(catcols1[:8]): plt.subplot(4,2,i+1) sns.boxplot(data=df_cat1, y = 'Monthly_EUR_Price', x = var) plt.tight_layout() plt.title('Distribution of Price by '+var) plt.show # In[310]: best = ['Construction_type','New_construction'] plt.figure(figsize=(8, 4)) for i, var in enumerate(best): plt.subplot(1,2,i+1) sns.boxplot(data=df_cat1, y = 'Monthly_EUR_Price', x = var) plt.tight_layout() plt.title('Distribution of Price by '+var) plt.show # In[257]: df_cat2 = df_cat.copy() for var in cat_cols[:14]: if (df[var].nunique()) > 20: df_cat2.drop(var,axis=1,inplace=True) if (df[var].nunique()) < 7: df_cat2.drop(var,axis=1,inplace=True) df_cat2.info() # In[258]: catcols2 = df_cat2.columns plt.figure(figsize=(18, 18)) for i, var in enumerate(catcols2[:4]): plt.subplot(2,2,i+1) sns.boxplot(data=df_cat2, y = 'Monthly_EUR_Price', x = var) plt.tight_layout() plt.title('Distribution of Price by '+var) plt.show # In[314]: plt.figure(figsize=(12, 4)) sns.boxplot(data=df_cat2, y = 'Monthly_EUR_Price', x = 'Renovation') # In[259]: df_cat3 = df_cat.copy() for var in cat_cols[:14]: if (df[var].nunique()) < 7: df_cat3.drop(var,axis=1,inplace=True) df_cat3.info() # In[260]: for var in (df_cat3.columns[:6]): other_levs = df_cat3[var].value_counts().index.tolist()[5:df_cat3[var].value_counts().shape[0]] df_cat3[var] = np.where(df_cat3[var].isin(other_levs), 'Other', df_cat3[var]) # In[261]: for var in (df_cat3.columns[:6]): print(df_cat3[var].value_counts()) # In[262]: plt.figure(figsize=(18, 18)) for i, var in enumerate(df_cat3.columns[:6]): plt.subplot(3,2,i+1) sns.boxplot(data=df_cat3, y = 'Monthly_EUR_Price', x = var) plt.tight_layout() plt.title('Distribution of Price by '+var) plt.show # In[263]: anova_df = pd.concat([df_cat1.drop(columns=['Monthly_EUR_Price']), df_cat3], axis=1) anova_df.info() # In[264]: an_df = pd.DataFrame({ 'Price':anova_df['Monthly_EUR_Price'] }) # In[266]: for var in anova_df.columns[:14]: an_df['IV']=anova_df[var] anv = ols('Price ~ C(IV)', data=an_df).fit() anv_tbl = sm.stats.anova_lm(anv, typ=2) print(var) print(anv_tbl) # In[267]: anova_df.Datetime.value_counts() # In[268]: Dec22 = anova_df['Datetime'] == '15/12/2022' Nov22 = anova_df['Datetime'] == '24/11/2022' Jan23 = anova_df['Datetime'] == '07/01/2023' # In[269]: cat_df = anova_df.drop(columns = ['Gender']) # In[270]: model_cat = ols('Monthly_EUR_Price~C(New_construction)+C(Construction_type)+C(Renovation)', data=anova_df).fit() print("Categoric Predictors") print(model_cat.summary()) # ### Correlation # In[271]: df_num = df1.select_dtypes(include=np.number) df_num = df_num.drop(columns=['Reg_id','Elevator','Children_are_welcome','Pets_allowed','Utility_payments']) df_num.info() # In[272]: round(df_num.corr(),4) # In[274]: model_num = ols('Monthly_EUR_Price ~ Floors_in_the_building+Floor_area+Number_of_rooms+Number_of_bathrooms+Ceiling_height+Floor', data=df_num).fit() print("Numeric Predictors Model") print(model_num.summary()) # ### Address and Price Increase # In[283]: df_nov = pd.read_csv('NovRent.csv') df_dec = pd.read_csv('DecRent.csv') df_jan = pd.read_csv('JanRent.csv') # In[284]: print(df_nov.shape) print(df_dec.shape) print(df_jan.shape) # In[285]: df_nov_dec = pd.merge(df_dec, df_nov, on=['Address'], how='inner') df_nov_dec.shape # In[287]: df_nov_dec['PriceChange']=(df_nov_dec['AverageDecRent']-df_nov_dec['AverageNovRent'])/df_nov_dec['AverageNovRent'] # In[288]: df_nov_dec.head() # In[289]: sns.boxplot(data = df_nov_dec, y = 'PriceChange') # In[290]: df_nov_dec['PriceChange'].describe() # In[323]: upperT = np.percentile(df_nov_dec['PriceChange'], 75)+1.5*(np.percentile(df_nov_dec['PriceChange'], 75)-np.percentile(df_nov_dec['PriceChange'], 25)) print(upperT) #upper outlier threshold lowerT = np.percentile(df_nov_dec['PriceChange'], 25)-1.5*(np.percentile(df_nov_dec['PriceChange'], 75)-np.percentile(df_nov_dec['PriceChange'], 25)) print(lowerT) # In[ ]: # In[324]: df_nov_dec = df_nov_dec[df_nov_dec['PriceChange']lowerT] sns.boxplot(data = df_nov_dec, y = 'PriceChange') # In[325]: df_nov_dec['PriceChange'].describe() # In[326]: df_nov_dec[df_nov_dec['PriceChange']==df_nov_dec['PriceChange'].min()] # In[327]: df_nov_dec[df_nov_dec['PriceChange']==df_nov_dec['PriceChange'].max()] # In[328]: df_dec_jan = pd.merge(df_dec, df_jan, on=['Address'], how='inner') df_dec_jan['PriceChange']=(df_dec_jan['AverageJanRent']-df_dec_jan['AverageDecRent'])/df_dec_jan['AverageDecRent'] print(df_dec_jan.shape) df_nov_dec.head() # In[292]: df_dec_jan['PriceChange'].describe() # In[329]: upperT = np.percentile(df_dec_jan['PriceChange'], 75)+1.5*(np.percentile(df_dec_jan['PriceChange'], 75)-np.percentile(df_dec_jan['PriceChange'], 25)) print(upperT) #upper outlier threshold lowerT = np.percentile(df_dec_jan['PriceChange'], 25)-1.5*(np.percentile(df_dec_jan['PriceChange'], 75)-np.percentile(df_dec_jan['PriceChange'], 25)) print(lowerT) df_dec_jan = df_dec_jan[df_dec_jan['PriceChange']lowerT] sns.boxplot(data = df_dec_jan, y = 'PriceChange') # In[330]: df_dec_jan['PriceChange'].describe() # In[293]: model_add = ols('Monthly_EUR_Price~C(Address)', data=anova_df).fit() print("Categoric Predictors") print(model_add.summary()) # In[302]: model_add_date = ols('Monthly_EUR_Price~C(Address)+C(Datetime)', data=anova_df).fit() print(model_add_date.summary()) # In[295]: model_add_full = ols('Monthly_EUR_Price~C(Address)', data=df1).fit() print("Categoric Predictors") print(model_add_full.summary()) # In[303]: model_add_full_date = ols('Monthly_EUR_Price~C(Datetime)+C(Address)', data=df1).fit() print(model_add_full_date.summary()) # In[304]: df5 = df1.copy() add_levs = df5['Address'].value_counts().index.tolist()[50:df5['Address'].value_counts().shape[0]] df5['Address51'] = np.where(df5['Address'].isin(add_levs), 'Other', df5['Address']) model_add51 = ols('Monthly_EUR_Price~C(Address51)', data=df5).fit() print("Categoric Predictors") print(model_add51.summary()) # In[305]: model_add51_date = ols('Monthly_EUR_Price~C(Datetime)+C(Address51)', data=df5).fit() print(model_add51_date.summary())