# data analysis and manipulation
import pandas as pd

# mathematical operations
import numpy as np

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# warnings
import warnings
warnings.filterwarnings('ignore')


df = pd.read_csv('kc_house_data.csv')
df


# data types
df.dtypes

id                 int64
date              object
price            float64
bedrooms           int64
bathrooms        float64
sqft_living        int64
sqft_lot           int64
floors           float64
waterfront         int64
view               int64
condition          int64
grade              int64
sqft_above         int64
sqft_basement      int64
yr_built           int64
yr_renovated       int64
zipcode            int64
lat              float64
long             float64
sqft_living15      int64
sqft_lot15         int64
dtype: object


# missing values
df.isnull().sum()

id               0
date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64


# duplicated values
df[df.duplicated(['id'], keep=False)]


df[df.duplicated(['id', 'date'], keep=False)]


# descriptive statistics
att_statistics = df.copy()
att_statistics.drop(columns = ['id', 'date', 'floors', 'waterfront',
                              'view', 'yr_built', 'yr_renovated',
                              'zipcode', 'lat', 'long'],
                    axis=1, inplace=True)

# measures of central tendency
mct_mean = pd.DataFrame(att_statistics.apply(np.mean)).T
mct_median =  pd.DataFrame(att_statistics.apply(np.median)).T

# measures of variability 
mov_min = pd.DataFrame(att_statistics.apply(min)).T
mov_q25 = pd.DataFrame(att_statistics.quantile(0.25)).T
mov_q50 = pd.DataFrame(att_statistics.quantile(0.50)).T
mov_q75 = pd.DataFrame(att_statistics.quantile(0.75)).T
mov_max = pd.DataFrame(att_statistics.apply(max)).T
mov_range = pd.DataFrame(att_statistics.apply(lambda x: x.max()-x.min())).T
mov_std = pd.DataFrame(att_statistics.apply(np.std)).T


# unique dataframe
statistics = pd.concat([mov_min, mov_q25, mov_q50, mov_q75, mov_max, mov_range,
                        mct_median, mct_mean, mov_std]).T.reset_index()
columns = ['attributes', 'min', '25%', '50%', '75%', 'max', 'range', 'median', 'mean', 'std']
statistics.columns = columns
statistics.set_index('attributes', inplace=True)

statistics


df.query('bedrooms >= 10')


# function to create a correlation matrix
def plot_corr(df, size=18):
    corr = df.corr()
    
    # generates a mask for the upper triangle
    mask = np.triu(np.ones_like(corr, dtype=bool))
    
    fig, ax = plt.subplots(figsize=(size, size))
    
    # custom colormap palette
    cmap = sns.diverging_palette(230, 20, as_cmap=True)

    # draws the heatmap with the mask
    sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1, vmin=-1, center=0, annot_kws={"fontsize": 10},
                annot=True, square=True, linewidths=.5, cbar_kws={"shrink": .5})
    
    # labels properties
    plt.xticks(range(len(corr.columns)), corr.columns, fontsize='11')
    plt.yticks(range(len(corr.columns)), corr.columns, fontsize='11')


# creating a correlation plot
plot_corr(df)


# filters the data of interest into a new dataframe
df_region = df.copy()
df_region.drop(columns=['bedrooms', 'bathrooms', 'sqft_living', 'floors',
                       'waterfront', 'view', 'sqft_above', 'sqft_basement',
                       'yr_built', 'yr_renovated', 'lat', 'long', 'sqft_lot15',
                       'sqft_living15', 'sqft_lot', 'date'],
               axis=1, inplace=True)

# calculates the median prices by region into a new dataframe
zip_median = df_region.groupby(['zipcode']).median().drop(columns=['id', 
                                                                   'condition',
                                                                   'grade'])
zip_median = zip_median.rename(columns={'price': 'zip_median'})

zip_median


# merges dataframes
df_median_region = pd.merge(df_region, zip_median, on='zipcode',
                            how='inner', sort=True)

df_median_region.set_index('zipcode', inplace=True, append=True, drop=True)
df_median_region.columns.names = ['attributes']

# iterates over the merged dataframe and displays houses status 
for i in df_median_region.index.values:
    if df_median_region.loc[i, 'price'] < df_median_region.loc[i, 'zip_median'] and df_median_region.loc[i, 'condition'] >= 3 and df_median_region.loc[i, 'grade'] >= 5:
        df_median_region.loc[i, 'status'] = 'Buy'
   
    elif df_median_region.loc[i, 'price'] <= df_median_region.loc[i, 'zip_median']*1.05 and df_median_region.loc[i, 'condition'] >= 3 and df_median_region.loc[i, 'grade'] >= 9: 
            df_median_region.loc[i, 'status'] = 'On Hold'
    
    else:
        df_median_region.loc[i, 'status'] = 'Not to buy'

# prints final table
df_median_region


# checks status totals
stts_buy = df_median_region[df_median_region.status == 'Buy']
stts_not_buy = df_median_region[df_median_region.status == 'Not to buy']
stts_hold = df_median_region[df_median_region.status == 'On Hold']

print('Buying:               %d' %stts_buy.status.count(), 
      '\nNot buying:           %d' %stts_not_buy.status.count(), 
      '\nHolding Analysis:       %d' %stts_hold.status.count(),
      '\n===========================',
      '\nTotal houses:         %d' %(stts_buy.status.count() +
                                     stts_not_buy.status.count() +
                                     stts_hold.status.count()))

Buying:               10554 
Not buying:           10899 
Holding Analysis:       160 
=========================== 
Total houses:         21613


# filters the data of interest into a new dataframe
df_season = df.copy()
df_season.drop(columns=['bedrooms', 'bathrooms', 'sqft_living', 'floors',
                       'waterfront', 'view', 'sqft_above', 'sqft_basement',
                       'yr_built', 'yr_renovated', 'lat', 'long', 'sqft_lot15',
                       'sqft_living15', 'sqft_lot'],
               axis=1, inplace=True)

# creates a dataset for days and months
seasn_day = pd.DatetimeIndex(df_season['date']).day
seasn_month = pd.DatetimeIndex(df_season['date']).month
seasn = pd.DataFrame(data={'day': seasn_day, 'month': seasn_month})

df_season


# creates a column to classify dates into seasons
for i in range(len(seasn)):
    if seasn.loc[i, 'month'] in range(1,3):
        seasn.loc[i, 'season'] = 'Winter'
    
    elif seasn.loc[i, 'month'] == 3:
        if seasn.loc[i, 'day'] in range(1, 21):
            seasn.loc[i, 'season'] = 'Winter'
        else:
            seasn.loc[i, 'season'] = 'Spring'
    
    elif seasn.loc[i, 'month'] in range(4, 6):
        seasn.loc[i, 'season'] = 'Spring'
    
    elif seasn.loc[i, 'month'] == 6:
        if seasn.loc[i, 'day'] in range(1, 21):
            seasn.loc[i, 'season'] = 'Spring'
        else:
            seasn.loc[i, 'season'] = 'Summer'
    
    elif seasn.loc[i, 'month'] in range(7, 9):
        seasn.loc[i, 'season'] = 'Summer'
        
    elif seasn.loc[i, 'month'] == 9:
        if seasn.loc[i, 'day'] in range(1, 22):
            seasn.loc[i, 'season'] = 'Summer'
        else:
            seasn.loc[i, 'season'] = 'Fall'
    
    elif seasn.loc[i, 'month'] in range(10, 12):
        seasn.loc[i, 'season'] = 'Fall'

    elif seasn.loc[i, 'month'] == 12:
        if seasn.loc[i, 'day'] in range(1, 21):
            seasn.loc[i, 'season'] = 'Fall'
        else:
            seasn.loc[i, 'season'] = 'Winter'

seasn


# turns column "date" into "season" 
df_season['date'] = seasn.season
df_season = df_season.rename(columns={'date': 'season'})

df_season


#pd.set_option('display.max_rows', None)

# calculates median for each season and zipcode
seasn_median = df_season.groupby(['zipcode',
                                  'season']).median().drop(columns=['id', 
                                                                    'condition',
                                                                    'grade'])
seasn_median = seasn_median.rename(columns={'price': 'seasn_median'})

# counts total house sample
houses_totals = df_season.groupby(['zipcode',
                                   'season']).count().drop(columns=['price', 
                                                                    'condition',
                                                                    'grade'])
houses_totals = houses_totals.rename(columns={'id': 'house_totals'})

# merges median and count into a unique dataframe
df_median_seasn = pd.merge(seasn_median, houses_totals,
                           on=['zipcode', 'season'],
                            how='inner')

df_median_seasn


# filters season with max median for each zipcode
seasn_max = seasn_median.groupby(['zipcode']).max()

seasn_max = pd.merge(seasn_max, df_median_seasn.reset_index(),
                           on=['zipcode', 'seasn_median'],
                            how='inner').drop(columns=['house_totals'])

# grouping seasn_max by zipcode to avoid duplicated data (zipcodes with two equal seasons max medians)
seasn_max =seasn_max.groupby(['zipcode']).first().reset_index()

seasn_max = seasn_max.rename(columns={'seasn_median': 'max_median_seasn'})

seasn_max


# merges dataframes
df_seasn_zip = pd.merge(df_season, df_median_seasn, 
                        on=['zipcode', 'season'],
                        how='left', sort=True)

df_seasn_zip = pd.merge(df_seasn_zip, seasn_max, 
                        on=['zipcode'],
                        how='inner', suffixes=('','_of_max_median'))

# reorders columns
df_seasn_zip = pd.DataFrame(data=[df_seasn_zip['zipcode'], df_seasn_zip['id'],
                           df_seasn_zip['house_totals'], df_seasn_zip['season'],
                           df_seasn_zip['seasn_median'], df_seasn_zip['price'],
                           df_seasn_zip['season_of_max_median'],
                           df_seasn_zip['max_median_seasn'],
                           df_seasn_zip['condition'], df_seasn_zip['grade']]).T

# sets up dataframe
df_seasn_zip.set_index('zipcode', inplace=True, append=True, drop=True)
df_seasn_zip.columns.names = ['attributes']

df_seasn_zip


# iterates over the merged dataframe and displays houses status 
for i in df_seasn_zip.index.values:
    if df_seasn_zip.loc[i, 'condition'] >= 3 and df_seasn_zip.loc[i, 'grade'] >= 5:
    
        if df_seasn_zip.loc[i, 'price'] < df_seasn_zip.loc[i, 'seasn_median']:
            df_seasn_zip.loc[i, 'status'] = 'Buy'
    
        elif df_seasn_zip.loc[i, 'price'] >= df_seasn_zip.loc[i, 'seasn_median'] and df_seasn_zip.loc[i, 'price'] < df_seasn_zip.loc[i, 'max_median_seasn']: 
            df_seasn_zip.loc[i, 'status'] = 'Buy'
        
        elif df_seasn_zip.loc[i, 'price'] >= df_seasn_zip.loc[i, 'max_median_seasn']:
            df_seasn_zip.loc[i, 'status'] = 'Not to buy'

    elif df_seasn_zip.loc[i, 'price'] <= df_seasn_zip.loc[i, 'seasn_median']*0.60 and df_seasn_zip.loc[i, 'condition'] >= 3 and df_seasn_zip.loc[i, 'grade'] >= 4: 
            df_seasn_zip.loc[i, 'status'] = 'Buy to renovate'
    
    else:
        df_seasn_zip.loc[i, 'status'] = 'Not to buy'

# prints final table
df_seasn_zip


# checks status totals
stts_buy_seasn = df_seasn_zip[df_seasn_zip.status == 'Buy']
stts_not_buy_seasn = df_seasn_zip[df_seasn_zip.status == 'Not to buy']
stts_renovate_seasn = df_seasn_zip[df_seasn_zip.status == 'Buy to renovate']

print('Buying:               %d' %stts_buy_seasn.status.count(), 
      '\nNot buying:            %d' %stts_not_buy_seasn.status.count(), 
      '\nBuying to renovate:      %d' %stts_renovate_seasn.status.count(),
      '\n===========================',
      '\nTotal houses:         %d' %(stts_buy_seasn.status.count() +
                                     stts_not_buy_seasn.status.count() +
                                     stts_renovate_seasn.status.count()))

Buying:               11840 
Not buying:            9759 
Buying to renovate:      14 
=========================== 
Total houses:         21613


houses_to_buy = df_seasn_zip.query("status == 'Buy'")

for i in houses_to_buy.index.values:
    if houses_to_buy.loc[i, 'price'] < houses_to_buy.loc[i, 'seasn_median']*0.85:
        houses_to_buy.loc[i, 'sell_season'] = houses_to_buy.loc[i, 'season']
        houses_to_buy.loc[i, 'sell_price'] = houses_to_buy.loc[i, 'price']*1.30
        houses_to_buy.loc[i, 'profit'] = houses_to_buy.loc[i, 'sell_price'] - houses_to_buy.loc[i, 'price'] 
    
    elif houses_to_buy.loc[i, 'price'] >= houses_to_buy.loc[i, 'seasn_median']*0.85 and houses_to_buy.loc[i, 'price'] <= houses_to_buy.loc[i, 'seasn_median']*1.05:
        houses_to_buy.loc[i, 'sell_season'] = houses_to_buy.loc[i, 'season']
        houses_to_buy.loc[i, 'sell_price'] = houses_to_buy.loc[i, 'price']*1.15
        houses_to_buy.loc[i, 'profit'] = houses_to_buy.loc[i, 'sell_price'] - houses_to_buy.loc[i, 'price'] 
    
    elif houses_to_buy.loc[i, 'price'] > houses_to_buy.loc[i, 'seasn_median']*1.05 and houses_to_buy.loc[i, 'price'] < houses_to_buy.loc[i, 'max_median_seasn']*0.85:
        houses_to_buy.loc[i, 'sell_season'] = houses_to_buy.loc[i, 'season_of_max_median']
        houses_to_buy.loc[i, 'sell_price'] = houses_to_buy.loc[i, 'price']*1.30
        houses_to_buy.loc[i, 'profit'] = houses_to_buy.loc[i, 'sell_price'] - houses_to_buy.loc[i, 'price'] 
        
    elif houses_to_buy.loc[i, 'price'] >= houses_to_buy.loc[i, 'max_median_seasn']*0.85:
        houses_to_buy.loc[i, 'sell_season'] = houses_to_buy.loc[i, 'season_of_max_median']
        houses_to_buy.loc[i, 'sell_price'] = houses_to_buy.loc[i, 'price']*1.15
        houses_to_buy.loc[i, 'profit'] = houses_to_buy.loc[i, 'sell_price'] - houses_to_buy.loc[i, 'price'] 

houses_to_buy = houses_to_buy.drop(columns=['house_totals', 'seasn_median',
                                            'season_of_max_median', 'grade',
                                            'max_median_seasn', 'condition',
                                            'status'])

houses_to_buy


houses_to_renovate = df_seasn_zip.query("status == 'Buy to renovate'")

for i in houses_to_renovate.index.values:
    if houses_to_renovate.loc[i, 'price'] <= houses_to_renovate.loc[i, 'seasn_median']*0.50:
        houses_to_renovate.loc[i, 'max_renovation_outlay'] = houses_to_renovate.loc[i, 'price']*0.40
        houses_to_renovate.loc[i, 'sell_season'] = houses_to_renovate.loc[i, 'season_of_max_median']
        houses_to_renovate.loc[i, 'sell_price'] = houses_to_renovate.loc[i, 'price']*1.40*1.60
        houses_to_renovate.loc[i, 'min_profit'] = houses_to_renovate.loc[i, 'sell_price'] - houses_to_renovate.loc[i, 'price']
    
    elif houses_to_renovate.loc[i, 'price'] > houses_to_renovate.loc[i, 'seasn_median']*0.50:
        houses_to_renovate.loc[i, 'max_renovation_outlay'] = houses_to_renovate.loc[i, 'price']*0.40
        houses_to_renovate.loc[i, 'sell_season'] = houses_to_renovate.loc[i, 'season_of_max_median']
        houses_to_renovate.loc[i, 'sell_price'] = houses_to_renovate.loc[i, 'price']*1.40*1.40
        houses_to_renovate.loc[i, 'min_profit'] = houses_to_renovate.loc[i, 'sell_price'] - houses_to_renovate.loc[i, 'max_renovation_outlay'] - houses_to_renovate.loc[i, 'price']

houses_to_renovate = houses_to_renovate.drop(columns=['house_totals', 'seasn_median', 
                                                      'season_of_max_median', 'max_median_seasn',
                                                      'condition', 'grade', 'status'])

houses_to_renovate


# figure size
fig = plt.figure(figsize=(9,6), dpi=80)

# creating bars
ax = sns.barplot(data=df, x='waterfront', y='price',
                 palette='pink_r')

# setting up style
sns.set_theme(style='ticks')
sns.despine(top=True, bottom=True, trim=True, offset=10)
plt.grid(axis='y', linestyle='--')

# setting up axis and title
plt.title('Average Prices for Houses either Overlooking or Not the Waterfront', fontsize=14, fontweight='bold')
plt.xlabel('')
plt.ylabel('Price', fontsize=12, fontweight='bold')
ax.set_xticklabels(['Not Overlooking', 'Overlooking'])
plt.rcParams['xtick.labelsize'] = 10
plt.rcParams['ytick.labelsize'] = 10

# saving the plot
ax.get_yaxis().get_major_formatter().set_scientific(False)
plt.savefig("plots/average-prices-for-houses-either-overlooking-or-not-the-waterfront.png", dpi=300, bbox_inches='tight')
plt.show()


# creates variables for both cases 
wtfront = df.loc[df['waterfront'] == 1]
not_wtfront = df.loc[df['waterfront'] == 0]

prices_wtfront = wtfront['price']
prices_not_wtfront = not_wtfront['price']

# displays average prices for both cases
wtfront_mean = prices_wtfront.mean()
not_wtfront_mean = prices_not_wtfront.mean()

print('Average prices for houses:'
     '\n\nOverlooking waterfront      >> U$ %.2f' %wtfront_mean,
     '\nNOT overlooking waterfront  >> U$ %.2f' %not_wtfront_mean)

# calculates percentage
percent_wtfront = (abs(wtfront_mean-not_wtfront_mean)/not_wtfront_mean)*100

# displays comparison
print('\nResults:',
      '\n\nHouses overlooking the waterfront are %.2f%% more expensive' %percent_wtfront,
      '\nthan houses not overlooking the waterfront, on average.')

Average prices for houses:

Overlooking waterfront      >> U$ 1661876.02 
NOT overlooking waterfront  >> U$ 531563.60

Results: 

Houses overlooking the waterfront are 212.64% more expensive 
than houses not overlooking the waterfront, on average.


# creates variables 
prior_1995 = df.loc[df['yr_built'] < 1995]
after_1995 = df.loc[df['yr_built'] >= 1995]

prices_prior_1995 = pd.DataFrame(prior_1995.price)
prices_after_1995 = pd.DataFrame(after_1995.price)

# points whether the house was built prior or after 1995
prices_prior_1995['prior_1995'] = 1
prices_after_1995['prior_1995'] = 0

# concat Dataframes into a unique
df_yr_built = pd.concat([prices_prior_1995, prices_after_1995], axis=0, join='outer')
columns = ['price', 'prior_1995']
df_yr_built.columns = columns


# figure size
fig = plt.figure(figsize=(9,6), dpi=80)

# creating bars
ax = sns.barplot(data=df_yr_built, x='prior_1995', y='price',
                 palette='pink_r')

# setting up style
sns.set_theme(style='ticks')
sns.despine(top=True, bottom=True, trim=True, offset=10)
plt.grid(axis='y', linestyle='--')

# setting up axis and title
plt.title('Average Prices for both Houses built prior and after 1995', fontsize=14, fontweight='bold')
plt.xlabel('')
plt.ylabel('Price', fontsize=12, fontweight='bold')
ax.set_xticklabels(['After 1995', 'Prior to 1995'])
plt.rcParams['xtick.labelsize'] = 10
plt.rcParams['ytick.labelsize'] = 10

# saving the plot
ax.get_yaxis().get_major_formatter().set_scientific(False)
plt.savefig("plots/average-prices-for-both-houses-built-prior-and-after-1995.png", dpi=300, bbox_inches='tight')
plt.show(fig)


# displays average prices for both cases
prior_1995_mean = prices_prior_1995.price.mean()
after_1995_mean = prices_after_1995.price.mean()

print('Average prices for houses:'
     '\n\nBuilt prior to 1995      >> U$ %.2f' %prior_1995_mean,
     '\nBuilt from 1995 onwards  >> U$ %.2f' %after_1995_mean)

# calculates percentage
percent_yr_built = (abs(prior_1995_mean-after_1995_mean)/after_1995_mean)*100

# displays comparison
print('\nResults:',
      '\n\nHouses built prior to 1995 are %.2f%% cheaper' %percent_yr_built,
      '\nthan houses built from 1995 onwards, on average.')

Average prices for houses:

Built prior to 1995      >> U$ 511660.72 
Built from 1995 onwards  >> U$ 617519.07

Results: 

Houses built prior to 1995 are 17.14% cheaper 
than houses built from 1995 onwards, on average.


# creates variables
baths_2 = df[df['bathrooms'] == 2]
baths_3 = df[df['bathrooms'] == 3]

prices_baths_2 = pd.DataFrame(baths_2.price)
prices_baths_3 = pd.DataFrame(baths_3['price'])

# points whether the house has 2 or 3 bathroomns
prices_baths_2['num_bath'] = 2
prices_baths_3['num_bath'] = 3

# concat Dataframes into a unique
df_num_bath = pd.concat([prices_baths_2, prices_baths_3], axis=0, join='outer')
columns = ['price', 'num_bath']
df_num_bath.columns = columns


# figure size
fig = plt.figure(figsize=(9,6), dpi=80)

# creating bars
ax = sns.barplot(data=df_num_bath, x='num_bath', y='price',
                 palette='pink_r')

# setting up style
sns.set_theme(style='ticks')
sns.despine(top=True, bottom=True, trim=True, offset=10)
plt.grid(axis='y', linestyle='--')

# creates variables
baths_2 = df[df['bathrooms'] == 2]
baths_3 = df[df['bathrooms'] == 3]

prices_baths_2 = pd.DataFrame(baths_2.price)
prices_baths_3 = pd.DataFrame(baths_3['price'])

# points whether the house has 2 or 3 bathroomns
prices_baths_2['num_bath'] = 2
prices_baths_3['num_bath'] = 3

# concat Dataframes into a unique
df_num_bath = pd.concat([prices_baths_2, prices_baths_3], axis=0, join='outer')
columns = ['price', 'num_bath']
df_num_bath.columns = columns

# setting up axis and title
plt.title('Average Prices for Houses with 2 and 3 Bathrooms', fontsize=14, fontweight='bold')
plt.xlabel('')
plt.ylabel('Price', fontsize=12, fontweight='bold')
ax.set_xticklabels(['2 Bathrooms', '3 Bathrooms'])
plt.rcParams['xtick.labelsize'] = 10
plt.rcParams['ytick.labelsize'] = 10

# saving the plot
ax.get_yaxis().get_major_formatter().set_scientific(False)
plt.savefig("plots/average-prices-for-houses-with-2-and-3-bathrooms.png", dpi=300, bbox_inches='tight')
plt.show(fig)


# displays average prices for both cases
baths_2_mean = prices_baths_2.price.mean()
baths_3_mean = prices_baths_3.price.mean()

print('Average prices for houses:'
     '\n\nWith 02 bathrooms   >> U$ %.2f' %baths_2_mean,
     '\nWith 03 bathrooms   >> U$ %.2f' %baths_3_mean)

# calculates percentage
percent_n_baths = (abs(baths_2_mean-baths_3_mean)/baths_3_mean)*100

# displays comparison
print('\nResults:',
      '\n\nHouses with 02 bathrooms are %.2f%% cheaper' %percent_n_baths,
      '\nthan houses with 03 bathrooms, on average.')

Average prices for houses:

With 02 bathrooms   >> U$ 457889.72 
With 03 bathrooms   >> U$ 708415.23

Results: 

Houses with 02 bathrooms are 35.36% cheaper 
than houses with 03 bathrooms, on average.


# creates variables
grade_9 = df[df['grade'] >= 9]
grade_7 = df[df['grade'] <= 7]

prices_grade_9 = pd.DataFrame(grade_9['price'])
prices_grade_7 = pd.DataFrame(grade_7.price)

# points whether the house is graded 9+ or 7-
prices_grade_9['grade_lim'] = 9
prices_grade_7['grade_lim'] = 7

# concat Dataframes into a unique
df_grade_lim = pd.concat([prices_grade_9, prices_grade_7], axis=0, join='outer')
columns = ['price', 'grade_lim']
df_grade_lim.columns = columns


# figure size
fig = plt.figure(figsize=(9,6), dpi=80)

# creating bars
ax = sns.barplot(data=df_grade_lim, x='grade_lim', y='price',
                 palette='pink_r')

# setting up style
sns.set_theme(style='ticks')
sns.despine(top=True, bottom=True, trim=True, offset=10)
plt.grid(axis='y', linestyle='--')

# setting up axis and title
plt.title('Average Prices for Houses Regarding their Grades', fontsize=14, fontweight='bold')
plt.xlabel('')
plt.ylabel('Price', fontsize=12, fontweight='bold')
ax.set_xticklabels(['Graded 7 or lower', 'Graded 9 or higher'])
plt.rcParams['xtick.labelsize'] = 10
plt.rcParams['ytick.labelsize'] = 10

# saving the plot
ax.get_yaxis().get_major_formatter().set_scientific(False)
plt.savefig("plots/average-prices-for-houses-regarding-their-grades.png", dpi=300, bbox_inches='tight')
plt.show(fig)


# displays average prices for both cases
grade_9_mean = prices_grade_9.price.mean()
grade_7_mean = prices_grade_7.price.mean()

print('Average prices for houses:'
     '\n\nWith grades 9 or higher   >> U$ %.2f' %grade_9_mean,
     '\nWith grades 7 or lower    >> U$ %.2f' %grade_7_mean)

# calculates percentage
percent_grade = (abs(grade_9_mean-grade_7_mean)/grade_7_mean)*100

# displays comparison
print('\nResults:',
      '\n\nHouses with grade 9 or higher are %.2f%% more expensive' %percent_grade,
      '\nthan houses with grades 7 or lower, on average.')

Average prices for houses:

With grades 9 or higher   >> U$ 959962.41 
With grades 7 or lower    >> U$ 380564.39

Results: 

Houses with grade 9 or higher are 152.25% more expensive 
than houses with grades 7 or lower, on average.


# creates new Dataframe
basement = df[df['sqft_basement'] != 0]
no_basement = df[df['sqft_basement'] == 0]

area_basement = pd.DataFrame(basement.sqft_lot)
area_no_basement = pd.DataFrame(no_basement.sqft_lot)

# points whether the house has basement or not
area_basement['bsmt'] = 1
area_no_basement['bsmt'] = 0

# concat Dataframes into a unique
df_basement = pd.concat([area_basement, area_no_basement], axis=0, join='outer')
columns = ['sqft_lot', 'bsmt']
df_basement.columns = columns


# figure size
fig = plt.figure(figsize=(8,6), dpi=80)

# creating bars
ax = sns.barplot(data=df_basement, x='bsmt', y='sqft_lot',
                 palette='pink_r')

# setting up style
sns.set_theme(style='ticks')
sns.despine(top=True, bottom=True, trim=True, offset=10)
plt.grid(axis='y', linestyle='--')

# setting up axis and title
plt.title('Average square footage for Houses with and without Basement', fontsize=14, fontweight='bold')
plt.xlabel('')
plt.ylabel('Square Footage', fontsize=12, fontweight='bold')
ax.set_xticklabels(['No Basement', 'Basement'])
plt.rcParams['xtick.labelsize'] = 10
plt.rcParams['ytick.labelsize'] = 10

# saving the plot
ax.get_yaxis().get_major_formatter().set_scientific(False)
plt.savefig("plots/average-square-footage-for-houses-with-and-without-basement.png", dpi=300, bbox_inches='tight')
plt.show()


# displays average total area for both cases
basement_mean = area_basement.sqft_lot.mean()
no_basement_mean = area_no_basement.sqft_lot.mean()

print('Average total square footage for houses:'
     '\n\nWith basement      >> %.2f sqft.' %basement_mean,
     '\nWithout basement   >> %.2f sqft.' %no_basement_mean)

# calculates percentage
percent_basements = (abs(basement_mean-no_basement_mean)/no_basement_mean)*100

# displays comparison
print('\nResults:',
      '\n\nHouses with basements are %.2f%% smaller' %percent_basements,
      '\nthan houses without basements, on average.')

Average total square footage for houses:

With basement      >> 13286.29 sqft. 
Without basement   >> 16284.18 sqft.

Results: 

Houses with basements are 18.41% smaller 
than houses without basements, on average.


# creates dataframe with dates and converts them to DateTime
date_df = pd.DataFrame(df.date)
date_df.date = pd.to_datetime(date_df['date'])

# creates a range of sorted dates starting on the earlier date of the dataset 
interval = pd.date_range(start=date_df.min().item(), periods=360)

# creates a list for monthly intervals of 30 days
j = 30
days = [day for day in interval[:j]]

# iniciates variables
prev_monthly_avg = 0
monthly_avg_list = []
variation_list = []
total_houses = []

# iterates over df and calculates mean for prices on the date range each month  
for period in range(12):
    monthly_prices = []
    
    for i in range(len(df)):
        if date_df.loc[i, 'date'] in days and df.loc[i, 'bathrooms'] == 3:
            monthly_prices.append(df.loc[i, 'price'])
    
    # calculates monthly average prices
    monthly_avg = sum(monthly_prices)/len(monthly_prices)
    monthly_avg_list.append(monthly_avg)
    total_houses.append(len(monthly_prices))
       
    if prev_monthly_avg > 0:
        # calculates variation
        variation = ((monthly_avg-prev_monthly_avg)/prev_monthly_avg)*100
        variation_list.append(variation)
    else:
        variation_list.append(0)
    
    prev_monthly_avg = monthly_avg
    days = [day for day in interval[j:j+30]]
    j += 30

# creates dataframe with records
MoM_variation = pd.DataFrame(data={'month': range(1, 13),
                                   'total_3_bathrooms': total_houses,
                                   'avg_prices': monthly_avg_list, 
                                   'MoM_variation (%)': variation_list})

MoM_variation.set_index(['month'], inplace=True)

MoM_variation.round(3)


MoM_variation.describe().round(2)


# figure size
fig = plt.figure(figsize=(15,8), dpi=80)
df.date = pd.to_datetime(df['date'])

# creating lines
ax = sns.lineplot(data=df.query('bathrooms == 3'),
                  x='date', y='price', hue='bathrooms',
                  palette='ch:l=.3,r=-.5', ci=None)

# setting up style
sns.set_style('white')
sns.despine(top=True, trim=True)

# setting up axis and title
plt.title('Price Variation over Time for Houses with 3 Bathrooms', fontsize=16, fontweight='bold')
plt.xlabel('Date', fontsize=13, fontweight='bold')
plt.ylabel('Price', fontsize=13, fontweight='bold')
plt.rcParams['xtick.labelsize'] = 10
plt.rcParams['ytick.labelsize'] = 10
ax.get_yaxis().get_major_formatter().set_scientific(False)

# limiting number of values on each axis
ax.xaxis.set_major_locator(plt.MaxNLocator(7))
ax.yaxis.set_major_locator(plt.MaxNLocator(6))

# saving the plot
plt.savefig("plots/price-variation-over-time-for-houses-with-3-bathrooms.png", dpi=300, bbox_inches='tight')
plt.show()


# figure size
fig = plt.figure(figsize=(20,14), dpi=80)

# creating scatters
ax = sns.stripplot(data=df, x="price", y="zipcode",
                   hue="condition", orient="h", 
                   palette="Blues")

# setting up style
sns.set_style('darkgrid')

# setting up axis and title
plt.title('Prices Distribution by Zipcode and Condition', fontsize=16, fontweight='bold')
plt.xlabel('Price', fontsize=12, fontweight='bold')
plt.ylabel('Zipcode', fontsize=12, fontweight='bold')
plt.rcParams['xtick.labelsize'] = 10
plt.rcParams['ytick.labelsize'] = 10
ax.get_xaxis().get_major_formatter().set_scientific(False)

# saving the plot
plt.savefig("plots/prices-distribution-by-zipcode-and-condition.png", dpi=300, bbox_inches='tight')
plt.show()


# figure size
fig = plt.figure(figsize=(20,14), dpi=80)

# creating scatters
ax = sns.stripplot(data=houses_to_buy.reset_index(), x="price", y="zipcode",
                   hue="season", hue_order=['Winter', 'Spring','Summer',  'Fall'], 
                   palette="pink", orient="h")

# setting up style
sns.set_style('darkgrid')

# setting up axis and title
plt.title('Prices Distribution for Houses to Buy', fontsize=16, fontweight='bold')
plt.xlabel('Price', fontsize=12, fontweight='bold')
plt.ylabel('Zipcode', fontsize=12, fontweight='bold')
plt.rcParams['xtick.labelsize'] = 10
plt.rcParams['ytick.labelsize'] = 10
ax.get_xaxis().get_major_formatter().set_scientific(False)

# saving the plot
plt.savefig("plots/prices-distribution-for-houses-to-buy.png", dpi=300, bbox_inches='tight')
plt.show()


# figure size
fig = plt.figure(figsize=(20,14), dpi=80)

# creating bars 
ax = sns.barplot(data=df_season, y="zipcode", x="price",
                 estimator=np.median, orient="h", ci=None,
                 palette="ch:r=-.3,g=1,l=.7")

# setting up style
sns.set_theme(style='darkgrid')

# setting up axis and title
plt.title('Median Price by Zipcode', fontsize=16, fontweight='bold')
plt.xlabel('Median Price', fontsize=12, fontweight='bold')
plt.ylabel('Zipcode', fontsize=12, fontweight='bold')
plt.rcParams['xtick.labelsize'] = 10
plt.rcParams['ytick.labelsize'] = 10
ax.get_xaxis().get_major_formatter().set_scientific(False)

# saving the plot
plt.savefig("plots/median-price-by-zipcode.png", dpi=300, bbox_inches='tight')
plt.show()


# figure size
fig = plt.figure(figsize=(20,14), dpi=80)

# creating bars 
ax = sns.countplot(data=df_seasn_zip.reset_index().query("status == 'Buy' or status == 'Not to buy'"),
                   y="zipcode", orient="h", dodge=False,
                   palette="bone_r", hue='status',
                   hue_order=['Buy', "Not to buy"])

# setting up style
sns.set_theme(style='whitegrid')
sns.despine(left=True, right=True, top=True, bottom=True)

# setting up axis and title
plt.title('Total Number of Houses to Buy', fontsize=16, fontweight='bold')
plt.xlabel('Total of houses', fontsize=12, fontweight='bold')
plt.ylabel('Zipcode', fontsize=12, fontweight='bold')
plt.rcParams['xtick.labelsize'] = 10
plt.rcParams['ytick.labelsize'] = 10

# saving the plot
plt.savefig("plots/total-number-of-houses-to-buy.png", dpi=300, bbox_inches='tight')
plt.show()


# figure size
fig = plt.figure(figsize=(16,8), dpi=80)

# creating bars
sns.barplot(data=houses_to_buy.reset_index(), estimator=np.mean,
            x='zipcode', y='profit', palette='ch:s=2,l=.6,r=1,d=.3')

# setting up style
sns.set_theme(style='ticks')
sns.despine(top=True, offset=10, trim=True)
plt.grid(axis='y', linestyle='--')

# setting up axis and title
plt.title('Average Profit on Houses to Buy by Region', fontsize=16, fontweight='bold')
plt.xlabel('Zipcode', fontsize=13, fontweight='bold')
plt.ylabel('Profit', fontsize=13, fontweight='bold')
plt.rcParams['xtick.labelsize'] = 9
plt.rcParams['ytick.labelsize'] = 9
plt.xticks(rotation=90)

# saving the plot
plt.savefig("plots/average-profit-on-houses-to-buy-by-region.png", dpi=300, bbox_inches='tight')
plt.show()


# figure size
fig = plt.figure(figsize=(12,8), dpi=80)
ax = fig.add_gridspec(2, 2) # creates grids

# creating bars
with sns.axes_style('white'):
    fig.add_subplot(ax[0, 0])
    sns.countplot(data=houses_to_buy,
                  x='sell_season',
                  palette='Greys_r',
                  order = houses_to_buy['sell_season'].value_counts().index)
    
    # setting up style
    sns.despine(top=True, bottom=True, offset=5)
    plt.grid(axis='y')

    # setting up axis and title
    plt.title('Houses to Buy', fontsize=14, fontweight='bold')
    plt.xlabel('Season to Sell', fontsize=12, fontweight='bold')
    plt.ylabel('Total Houses', fontsize=12, fontweight='bold')
    plt.rcParams['xtick.labelsize'] = 10
    plt.rcParams['ytick.labelsize'] = 10

# creating bars
with sns.axes_style('white'):
    fig.add_subplot(ax[0, 1])
    sns.countplot(data=houses_to_renovate,
                  x='sell_season',
                  palette='Greys_r',
                  order = houses_to_renovate['sell_season'].value_counts().index)
    
    # setting up style
    sns.despine(top=True, bottom=True, offset=5, trim=True)
    plt.grid(axis='y')
    
    # setting up axis and title
    plt.title('Houses to Renovate', fontsize=14, fontweight='bold')
    plt.xlabel('Season to Sell', fontsize=12, fontweight='bold')
    plt.ylabel('')
    plt.rcParams['xtick.labelsize'] = 10
    plt.rcParams['ytick.labelsize'] = 10

# plotting and saving the plot
plt.savefig("plots/total-sales-by-season.png", dpi=300, bbox_inches='tight')
fig.tight_layout()
plt.show()


# figure size
fig = plt.figure(figsize=(12,6), dpi=80)

# creating bars 
sns.set_color_codes("deep")
sns.barplot(x="sell_price", y="id", data=houses_to_renovate,
            label="Sell", color="#874c62", orient='h')

sns.set_color_codes("muted")
sns.barplot(x="price", y="id", data=houses_to_renovate,
            label="Purchase", color="#c292a1", orient='h')

sns.set_color_codes("deep")
sns.barplot(x="max_renovation_outlay", y="id", data=houses_to_renovate,
            label="Renovation", color="#c8aca9", orient='h')

# setting up style
sns.set_theme(style='white')
sns.despine(left=True, right=True, top=True, bottom=True)
plt.grid(axis='y')


# setting up axis and title
plt.title('Houses to Renovate Summary', fontsize=16, fontweight='bold')
plt.xlabel('Price', fontsize=13, fontweight='bold')
plt.ylabel('House ID', fontsize=13, fontweight='bold')
plt.rcParams['xtick.labelsize'] = 10
plt.rcParams['ytick.labelsize'] = 10
plt.legend()

# saving the plot
plt.savefig("plots/houses-to-renovate-summary.png", dpi=300, bbox_inches='tight')
plt.show()

Atributte	Type	Description
id	Integer	Unique ID for each home sold.
date	Object	Date of the home sale.
price	Float	Price of each home sold.
bedrooms	Integer	Number of bedrooms.
bathrooms	Float	Number of bathrooms, where .5 accounts for a room with a toilet but no shower.
sqft_living	Integer	Square footage of the apartments interior living space.
sqft_lot	Integer	Square footage of the land space.
floors	Float	Number of floors.
waterfront	Integer	A dummy variable for whether the apartment was overlooking the waterfront or not.
view	Integer	An index from 0 to 4 of how good the view of the property was.
condition	Integer	An index from 1 to 5 on the condition of the apartment.
grade	Integer	An index from 1 to 13, where 1-3 falls short of building construction and design, 7 has an average level of construction and design, and 11-13 have a high quality level of construction and design.
sqft_above	Integer	The square footage of the interior housing space that is above ground level.
sqft_basement	Integer	The square footage of the interior housing space that is below ground level.
yr_built	Integer	The year the house was initially built.
yr_renovated	Integer	The year of the house’s last renovation.
zipcode	Integer	What zipcode area the house is in.
lat	Float	Lattitude.
long	Float	Longitude.
sqft_living15	Integer	The square footage of interior housing living space for the nearest 15 neighbors.
sqft_lot15	Integer	The square footage of the land lots of the nearest 15 neighbors.

	id	date	price	bedrooms	bathrooms	sqft_living	sqft_lot	floors	waterfront	view	...	grade	sqft_above	sqft_basement	yr_built	yr_renovated	zipcode	lat	long	sqft_living15	sqft_lot15
0	7129300520	20141013T000000	221900.0	3	1.00	1180	5650	1.0	0	0	...	7	1180	0	1955	0	98178	47.5112	-122.257	1340	5650
1	6414100192	20141209T000000	538000.0	3	2.25	2570	7242	2.0	0	0	...	7	2170	400	1951	1991	98125	47.7210	-122.319	1690	7639
2	5631500400	20150225T000000	180000.0	2	1.00	770	10000	1.0	0	0	...	6	770	0	1933	0	98028	47.7379	-122.233	2720	8062
3	2487200875	20141209T000000	604000.0	4	3.00	1960	5000	1.0	0	0	...	7	1050	910	1965	0	98136	47.5208	-122.393	1360	5000
4	1954400510	20150218T000000	510000.0	3	2.00	1680	8080	1.0	0	0	...	8	1680	0	1987	0	98074	47.6168	-122.045	1800	7503
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
21608	263000018	20140521T000000	360000.0	3	2.50	1530	1131	3.0	0	0	...	8	1530	0	2009	0	98103	47.6993	-122.346	1530	1509
21609	6600060120	20150223T000000	400000.0	4	2.50	2310	5813	2.0	0	0	...	8	2310	0	2014	0	98146	47.5107	-122.362	1830	7200
21610	1523300141	20140623T000000	402101.0	2	0.75	1020	1350	2.0	0	0	...	7	1020	0	2009	0	98144	47.5944	-122.299	1020	2007
21611	291310100	20150116T000000	400000.0	3	2.50	1600	2388	2.0	0	0	...	8	1600	0	2004	0	98027	47.5345	-122.069	1410	1287
21612	1523300157	20141015T000000	325000.0	2	0.75	1020	1076	2.0	0	0	...	7	1020	0	2008	0	98144	47.5941	-122.299	1020	1357

	id	date	price	bedrooms	bathrooms	sqft_living	sqft_lot	floors	waterfront	view	...	grade	sqft_above	sqft_basement	yr_built	yr_renovated	zipcode	lat	long	sqft_living15	sqft_lot15
93	6021501535	20140725T000000	430000.0	3	1.50	1580	5000	1.0	0	0	...	8	1290	290	1939	0	98117	47.6870	-122.386	1570	4500
94	6021501535	20141223T000000	700000.0	3	1.50	1580	5000	1.0	0	0	...	8	1290	290	1939	0	98117	47.6870	-122.386	1570	4500
313	4139480200	20140618T000000	1384000.0	4	3.25	4290	12103	1.0	0	3	...	11	2690	1600	1997	0	98006	47.5503	-122.102	3860	11244
314	4139480200	20141209T000000	1400000.0	4	3.25	4290	12103	1.0	0	3	...	11	2690	1600	1997	0	98006	47.5503	-122.102	3860	11244
324	7520000520	20140905T000000	232000.0	2	1.00	1240	12092	1.0	0	0	...	6	960	280	1922	1984	98146	47.4957	-122.352	1820	7460
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
20670	8564860270	20150330T000000	502000.0	4	2.50	2680	5539	2.0	0	0	...	8	2680	0	2013	0	98045	47.4759	-121.734	2680	5992
20779	6300000226	20140626T000000	240000.0	4	1.00	1200	2171	1.5	0	0	...	7	1200	0	1933	0	98133	47.7076	-122.342	1130	1598
20780	6300000226	20150504T000000	380000.0	4	1.00	1200	2171	1.5	0	0	...	7	1200	0	1933	0	98133	47.7076	-122.342	1130	1598
21580	7853420110	20141003T000000	594866.0	3	3.00	2780	6000	2.0	0	0	...	9	2780	0	2013	0	98065	47.5184	-121.886	2850	6000
21581	7853420110	20150504T000000	625000.0	3	3.00	2780	6000	2.0	0	0	...	9	2780	0	2013	0	98065	47.5184	-121.886	2850	6000

	min	25%	50%	75%	max	range	median	mean	std
attributes
price	75000.0	321950.00	450000.00	645000.0	7700000.0	7625000.0	450000.00	540088.141767	367118.703181
bedrooms	0.0	3.00	3.00	4.0	33.0	33.0	3.00	3.370842	0.930040
bathrooms	0.0	1.75	2.25	2.5	8.0	8.0	2.25	2.114757	0.770145
sqft_living	290.0	1427.00	1910.00	2550.0	13540.0	13250.0	1910.00	2079.899736	918.419649
sqft_lot	520.0	5040.00	7618.00	10688.0	1651359.0	1650839.0	7618.00	15106.967566	41419.553273
condition	1.0	3.00	3.00	4.0	5.0	4.0	3.00	3.409430	0.650728
grade	1.0	7.00	7.00	8.0	13.0	12.0	7.00	7.656873	1.175432
sqft_above	290.0	1190.00	1560.00	2210.0	9410.0	9120.0	1560.00	1788.390691	828.071820
sqft_basement	0.0	0.00	0.00	560.0	4820.0	4820.0	0.00	291.509045	442.564804
sqft_living15	399.0	1490.00	1840.00	2360.0	6210.0	5811.0	1840.00	1986.552492	685.375448
sqft_lot15	651.0	5100.00	7620.00	10083.0	871200.0	870549.0	7620.00	12768.455652	27303.547963

	id	date	price	bedrooms	bathrooms	sqft_living	sqft_lot	floors	view	...	grade	sqft_above	sqft_basement	yr_built	yr_renovated	zipcode	lat	long	sqft_living15	sqft_lot15
8757	1773100755	20140821T000000	520000.0	11	3.00	3000	4960	2.0	0	...	7	2400	600	1918	1999	98106	47.5560	-122.363	1420	4960
13314	627300145	20140814T000000	1148000.0	10	5.25	4590	10920	1.0	2	...	9	2500	2090	2008	0	98004	47.5861	-122.113	2730	10400
15161	5566100170	20141029T000000	650000.0	10	2.00	3610	11914	2.0	0	...	7	3010	600	1958	0	98006	47.5705	-122.175	2040	11914
15870	2402100895	20140625T000000	640000.0	33	1.75	1620	6000	1.0	0	...	7	1040	580	1947	0	98103	47.6878	-122.331	1330	4700
19254	8812401450	20141229T000000	660000.0	10	3.00	2920	3745	2.0	0	...	7	1860	1060	1913	0	98105	47.6635	-122.320	1810	3745

Business and Analysis Understanding¶

Business Problem¶

Business Assumptions¶

Dataset Description¶

Solution Planning¶

Hypothesis¶

1. Imports¶

2. Descriptive Analysis¶

3. Data Cleaning and Preparation¶

4. Exploratory Data Analysis¶

Business Problem Nº1¶

Business Problem Nº2¶

5. Communicating Results¶

Evaluating Hypothesis¶

H1: Overlooking the Waterfront¶

H2: Built Prior to 1995¶

H3: Prices for 2 or 3 Bathrooms¶

H4: Grades and Prices¶

H5: With or Without Basements¶

H6: MoM positive Growth¶

Graphical Data Visualization¶

6. Conclusion¶

Selling suggestions
If the buying price is lower than the median for the corresponding region and seasonality, the selling price should be equal to the purchase value with a plus of 30%/.
If the buying price is higher than the median for the corresponding region and seasonality, the selling price should be equal to the purchase value with a plus of 15%.

	zip_median
zipcode
98001	260000.0
98002	235000.0
98003	267475.0
98004	1150000.0
98005	765475.0
...	...
98177	554000.0
98178	278277.0
98188	264000.0
98198	265000.0
98199	689800.0

	attributes	id	price	condition	grade	zip_median	status
	zipcode
0	98001	7895500070	240000.0	2	7	260000.0	Not to buy
1	98001	3717000160	287000.0	3	7	260000.0	Not to buy
2	98001	8961960160	480000.0	3	9	260000.0	Not to buy
3	98001	4014400292	465000.0	3	9	260000.0	Not to buy
4	98001	1115450240	360000.0	3	9	260000.0	Not to buy
...	...	...	...	...	...	...	...
21608	98199	2770601912	570000.0	3	9	689800.0	Buy
21609	98199	1070000180	1107460.0	3	9	689800.0	Not to buy
21610	98199	7010700308	1010800.0	3	9	689800.0	Not to buy
21611	98199	2770601782	453000.0	3	8	689800.0	Buy
21612	98199	6821101731	549000.0	3	8	689800.0	Buy

	day	month	season
0	13	10	Fall
1	9	12	Fall
2	25	2	Winter
3	9	12	Fall
4	18	2	Winter
...	...	...	...
21608	21	5	Spring
21609	23	2	Winter
21610	23	6	Summer
21611	16	1	Winter
21612	15	10	Fall

		seasn_median	house_totals
zipcode	season
98001	Fall	250000.0	89
	Spring	261000.0	106
	Summer	263200.0	98
	Winter	269950.0	69
98002	Fall	238000.0	49
...	...	...	...
98198	Winter	275000.0	63
98199	Fall	650000.0	81
	Spring	730000.0	103
	Summer	722500.0	82
	Winter	604000.0	51

	attributes	id	season	price	sell_season	sell_price	profit
	zipcode
3	98001	3353401710	Fall	227950.0	Fall	262142.5	34192.5
4	98001	8159610060	Fall	233000.0	Fall	267950.0	34950.0
5	98001	5066400483	Fall	249900.0	Fall	287385.0	37485.0
12	98001	3356404330	Fall	206000.0	Fall	267800.0	61800.0
13	98001	3874000240	Fall	210000.0	Fall	273000.0	63000.0
...	...	...	...	...	...	...	...
21607	98199	6917700305	Winter	529000.0	Winter	608350.0	79350.0
21609	98199	2351800065	Winter	590000.0	Winter	678500.0	88500.0
21610	98199	8127700845	Winter	375000.0	Winter	487500.0	112500.0
21611	98199	6821101732	Winter	550000.0	Winter	632500.0	82500.0
21612	98199	2770601457	Winter	542300.0	Winter	623645.0	81345.0

	attributes	id	season	price	max_renovation_outlay	sell_season	sell_price	min_profit
	zipcode
2661	98014	8655900162	Winter	156000.0	62400.0	Summer	349440.0	193440.0
3601	98024	6730700385	Fall	205000.0	82000.0	Spring	459200.0	254200.0
3763	98027	5279100625	Spring	248000.0	99200.0	Summer	555520.0	307520.0
4046	98027	2354300456	Winter	130000.0	52000.0	Summer	291200.0	161200.0
5145	98031	1822059057	Summer	152000.0	60800.0	Fall	297920.0	85120.0
5301	98032	6146600170	Summer	100000.0	40000.0	Summer	224000.0	124000.0
8990	98055	3340401535	Fall	140000.0	56000.0	Spring	313600.0	173600.0
10196	98059	3664500300	Fall	230000.0	92000.0	Spring	450800.0	128800.0
11303	98074	1925069006	Fall	355000.0	142000.0	Spring	695800.0	198800.0
13606	98106	6453300055	Fall	188000.0	75200.0	Winter	368480.0	105280.0
13728	98106	2114700500	Spring	90000.0	36000.0	Winter	201600.0	111600.0
13897	98106	2114700090	Winter	151000.0	60400.0	Winter	338240.0	187240.0
18556	98136	4322200105	Spring	229050.0	91620.0	Winter	513072.0	284022.0
21136	98198	6929602721	Spring	95000.0	38000.0	Winter	212800.0	117800.0

	total_3_bathrooms	avg_prices	MoM_variation (%)
month
1	70	690080.714	0.000
2	78	775057.564	12.314
3	68	748064.015	-3.483
4	66	730831.818	-2.304
5	68	651971.015	-10.791
6	60	643131.150	-1.356
7	52	667909.615	3.853
8	42	721742.310	8.060
9	39	636911.538	-11.754
10	49	741176.510	16.370
11	53	687297.075	-7.269
12	77	764233.974	11.194

	total_3_bathrooms	avg_prices	MoM_variation (%)
count	12.00	12.00	12.00
mean	60.17	704867.27	1.24
std	13.06	48349.95	9.21
min	39.00	636911.54	-11.75
25%	51.25	663924.97	-4.43
50%	63.00	705911.51	-0.68
75%	68.50	742898.39	8.84
max	78.00	775057.56	16.37