import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


!git clone https://github.com/JustinHaysbert/New-York-Housing-Project.git

Cloning into 'New-York-Housing-Project'...
remote: Enumerating objects: 6, done.
remote: Counting objects: 100% (6/6), done.
remote: Compressing objects: 100% (4/4), done.
remote: Total 6 (delta 0), reused 0 (delta 0), pack-reused 0
Receiving objects: 100% (6/6), 265.76 KiB | 5.54 MiB/s, done.


!ls New-York-Housing-Project

NY-House-Dataset.csv  README.md


housing=pd.read_csv('New-York-Housing-Project/NY-House-Dataset.csv')
housing


housing.describe()


housing.set_index("ADDRESS").head()


housing.isnull().sum()
#no null values in the dataset

BROKERTITLE                    0
TYPE                           0
PRICE                          0
BEDS                           0
BATH                           0
PROPERTYSQFT                   0
ADDRESS                        0
STATE                          0
MAIN_ADDRESS                   0
ADMINISTRATIVE_AREA_LEVEL_2    0
LOCALITY                       0
SUBLOCALITY                    0
STREET_NAME                    0
LONG_NAME                      0
FORMATTED_ADDRESS              0
LATITUDE                       0
LONGITUDE                      0
dtype: int64


housing.rename(columns = {"BROKERTITLE": "broker",
                                    "TYPE": "type",
                                    "PRICE":"price",
                                    "BEDS":"beds",
                                    "PROPERTYSQFT":"sqft",
                                    "ADDRESS":"address",
                                    "STATE":"state",
                                    "MAIN_ADDRESS":"main address",
                                    "ADMINISTRATIVE_AREA_LEVEL_2":"aal",
                                    "LOCALITY":"locality",
                                    "SUBLOCALITY":"sublocality",
                                    "STREET_NAME":"street name",
                                    "LONG_NAME":"long name",
                                    "FORMATTED_ADDRESS":"formatted address",
                                    "LATITUDE":"latitude",
                                    "LONGITUDE":"longitude"
                                    }, inplace = True)


housing.head()


housing.dtypes

broker                object
type                  object
price                  int64
beds                   int64
BATH                 float64
sqft                 float64
address               object
state                 object
main address          object
aal                   object
locality              object
sublocality           object
street name           object
long name             object
formatted address     object
latitude             float64
longitude            float64
dtype: object


housing = housing.drop(["street name", "long name", "formatted address", "aal", "state" ], axis=1)


housing.head()


sns.pairplot(housing)

<seaborn.axisgrid.PairGrid at 0x7e5b74daa800>


sns.boxplot(data=housing, x='price')

<Axes: xlabel='price'>


lower_percentile = housing['price'].quantile(0.01)
upper_percentile = housing['price'].quantile(0.95)
lower_percentile
upper_percentile

6975000.0


housing = housing[(housing['price'] > lower_percentile) & (housing['price'] < upper_percentile)]


sns.boxplot(data=housing, x='price')

<Axes: xlabel='price'>


lower_percentile = housing['price'].quantile(0.01)
upper_percentile = housing['price'].quantile(0.90)
lower_percentile
upper_percentile

2495000.0


housing = housing[(housing['price'] > lower_percentile) & (housing['price'] < upper_percentile)]


sns.boxplot(data=housing, x='price')

<Axes: xlabel='price'>


sns.boxplot(data=housing, x='price', y='type')

<Axes: xlabel='price', ylabel='type'>


sns.countplot(data=housing, y='type')

<Axes: xlabel='count', ylabel='type'>


sns.scatterplot(data=housing, x = 'longitude', y='latitude', hue = 'price', size = 'price',sizes=(20, 100) ,palette="viridis")

<Axes: xlabel='longitude', ylabel='latitude'>


import plotly.express as px
figure = px.scatter_mapbox(housing,
                        lat='latitude',
                        lon='longitude',
                        color='price',
                        size='price',
                        color_continuous_scale='viridis',
                        size_max=18,
                        zoom=10,
                        height=800,
                        width=800)

figure.update_layout(mapbox_style="open-street-map")
figure.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
figure.show()


housing_corr = housing.drop(['broker','type','address', 'main address', 'locality',	'sublocality'], axis=1)


dataplot=sns.heatmap(housing_corr.corr())


housing_corr.head()


from sklearn.model_selection import train_test_split


train_set, test_set = train_test_split(housing_corr, test_size=0.2, random_state=0)


housing_test_pd = pd.DataFrame(test_set)
housing_train_pd = pd.DataFrame(train_set)


housing_train = train_set.drop("price", axis=1)
housing_train_labels = train_set["price"].copy()

housing_test = test_set.drop("price", axis=1)
housing_test_labels = test_set["price"].copy()


from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scalar', StandardScaler()),
  ])
housing_num_tr_test = num_pipeline.fit_transform(housing_test)
housing_num_tr_train = num_pipeline.fit_transform(housing_train)


housing_num_tr_test.shape

(803, 5)


housing_num_tr_train.shape

(3208, 5)


housing_test_labels.shape

(803,)


housing_train_labels.shape

(3208,)


from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_num_tr_train, housing_train_labels)

LinearRegression()

LinearRegression()


from sklearn.model_selection import cross_val_score
import numpy as np
lin_scores = cross_val_score(lin_reg, housing_num_tr_train, housing_train_labels,
                         scoring="neg_mean_squared_error", cv=5)
lin_rmse_scores = np.sqrt(-lin_scores)

def display_scores(scores):
  print("Scores:", scores)
  print("Mean:", scores.mean())
  print("STD:", scores.std())
display_scores(lin_rmse_scores)

Scores: [433085.32155812 403438.21725386 428537.90291829 415511.93955392
 424777.54778529]
Mean: 421070.1858138954
STD: 10541.871136367174


test_predictions = lin_reg.predict(housing_num_tr_test)

# Evaluate the model's performance on the test set
from sklearn.metrics import mean_squared_error
test_rmse = np.sqrt(mean_squared_error(housing_test_labels, test_predictions))
print("Test RMSE:", test_rmse)

Test RMSE: 418046.4584717031


from sklearn.ensemble import RandomForestRegressor
forest_reg=RandomForestRegressor()
forest_reg.fit(housing_num_tr_train, housing_train_labels)
forest_reg_scores=cross_val_score(forest_reg, housing_num_tr_train, housing_train_labels, scoring='neg_mean_squared_error', cv=10)
forest_rmse=np.sqrt(-forest_reg_scores)
def display_scores(scores):
  print("Scores:", scores)
  print("Mean:", scores.mean())
  print("STD:", scores.std())
display_scores(forest_rmse)

Scores: [283618.18566187 270380.84114225 280618.48898793 274494.33703338
 297805.27514983 266011.94923511 272809.5767164  303851.34655884
 310095.35848269 292805.08619679]
Mean: 285249.04451651016
STD: 14365.837240900035


test_predictions = forest_reg.predict(housing_num_tr_test)

# Evaluate the model's performance on the test set
from sklearn.metrics import mean_squared_error
test_rmse = np.sqrt(mean_squared_error(housing_test_labels, test_predictions))
print("Test RMSE:", test_rmse)

Test RMSE: 304976.65859331394


from sklearn.model_selection import GridSearchCV
parameters = {
    'n_estimators': [100, 150, 200, 250, 300],
    'max_depth': [1,2,3,4],
}
regr = RandomForestRegressor(random_state=0)

clf = GridSearchCV(regr, parameters)
clf.fit(housing_num_tr_train, housing_train_labels)

GridSearchCV(estimator=RandomForestRegressor(random_state=0),
             param_grid={'max_depth': [1, 2, 3, 4],
                         'n_estimators': [100, 150, 200, 250, 300]})

GridSearchCV(estimator=RandomForestRegressor(random_state=0),
             param_grid={'max_depth': [1, 2, 3, 4],
                         'n_estimators': [100, 150, 200, 250, 300]})

RandomForestRegressor(random_state=0)

RandomForestRegressor(random_state=0)


clf.best_params_

{'max_depth': 4, 'n_estimators': 250}


best_model = clf.best_estimator_


test_predictions = best_model.predict(housing_num_tr_test)

# Compute RMSE
test_rmse = np.sqrt(mean_squared_error(housing_test_labels, test_predictions))


test_rmse

346802.8294555652


housing_corr.head()


housing_new = housing_corr.drop(['latitude', 'longitude'], axis=1)


from sklearn.model_selection import train_test_split


train_set, test_set = train_test_split(housing_new, test_size=0.2, random_state=0)


housing_train = train_set.drop("price", axis=1)
housing_train_labels = train_set["price"].copy()

housing_test = test_set.drop("price", axis=1)
housing_test_labels = test_set["price"].copy()


from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scalar', StandardScaler()),
  ])
housing_num_tr_test = num_pipeline.fit_transform(housing_test)
housing_num_tr_train = num_pipeline.fit_transform(housing_train)


housing_num_tr_test.shape

(803, 3)


housing_num_tr_train.shape

(3208, 3)


housing_test_labels.shape

(803,)


housing_train_labels.shape

(3208,)


from sklearn.model_selection import GridSearchCV
parameters = {
    'n_estimators': [100, 150, 200, 250, 300],
    'max_depth': [1,2,3,4],
}
regr = RandomForestRegressor(random_state=0)

clf = GridSearchCV(regr, parameters)
clf.fit(housing_num_tr_train, housing_train_labels)

GridSearchCV(estimator=RandomForestRegressor(random_state=0),
             param_grid={'max_depth': [1, 2, 3, 4],
                         'n_estimators': [100, 150, 200, 250, 300]})

GridSearchCV(estimator=RandomForestRegressor(random_state=0),
             param_grid={'max_depth': [1, 2, 3, 4],
                         'n_estimators': [100, 150, 200, 250, 300]})

RandomForestRegressor(random_state=0)

RandomForestRegressor(random_state=0)


best_model = clf.best_estimator_


test_predictions = best_model.predict(housing_num_tr_test)

# Compute RMSE
test_rmse = np.sqrt(mean_squared_error(housing_test_labels, test_predictions))


test_rmse

404993.4834963244

	BROKERTITLE	TYPE	PRICE	BEDS	BATH	PROPERTYSQFT	ADDRESS	STATE	MAIN_ADDRESS	ADMINISTRATIVE_AREA_LEVEL_2	LOCALITY	SUBLOCALITY	STREET_NAME	LONG_NAME	FORMATTED_ADDRESS	LATITUDE	LONGITUDE
0	Brokered by Douglas Elliman -111 Fifth Ave	Condo for sale	315000	2	2.000000	1400.000000	2 E 55th St Unit 803	New York, NY 10022	2 E 55th St Unit 803New York, NY 10022	New York County	New York	Manhattan	East 55th Street	Regis Residence	Regis Residence, 2 E 55th St #803, New York, N...	40.761255	-73.974483
1	Brokered by Serhant	Condo for sale	195000000	7	10.000000	17545.000000	Central Park Tower Penthouse-217 W 57th New Yo...	New York, NY 10019	Central Park Tower Penthouse-217 W 57th New Yo...	United States	New York	New York County	New York	West 57th Street	217 W 57th St, New York, NY 10019, USA	40.766393	-73.980991
2	Brokered by Sowae Corp	House for sale	260000	4	2.000000	2015.000000	620 Sinclair Ave	Staten Island, NY 10312	620 Sinclair AveStaten Island, NY 10312	United States	New York	Richmond County	Staten Island	Sinclair Avenue	620 Sinclair Ave, Staten Island, NY 10312, USA	40.541805	-74.196109
3	Brokered by COMPASS	Condo for sale	69000	3	1.000000	445.000000	2 E 55th St Unit 908W33	Manhattan, NY 10022	2 E 55th St Unit 908W33Manhattan, NY 10022	United States	New York	New York County	New York	East 55th Street	2 E 55th St, New York, NY 10022, USA	40.761398	-73.974613
4	Brokered by Sotheby's International Realty - E...	Townhouse for sale	55000000	7	2.373861	14175.000000	5 E 64th St	New York, NY 10065	5 E 64th StNew York, NY 10065	United States	New York	New York County	New York	East 64th Street	5 E 64th St, New York, NY 10065, USA	40.767224	-73.969856
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
4796	Brokered by COMPASS	Co-op for sale	599000	1	1.000000	2184.207862	222 E 80th St Apt 3A	Manhattan, NY 10075	222 E 80th St Apt 3AManhattan, NY 10075	New York	New York County	New York	Manhattan	222	222 E 80th St #3a, New York, NY 10075, USA	40.774350	-73.955879
4797	Brokered by Mjr Real Estate Llc	Co-op for sale	245000	1	1.000000	2184.207862	97-40 62 Dr Unit Lg	Rego Park, NY 11374	97-40 62 Dr Unit LgRego Park, NY 11374	United States	New York	Queens County	Queens	62nd Drive	97-40 62nd Dr, Rego Park, NY 11374, USA	40.732538	-73.860152
4798	Brokered by Douglas Elliman - 575 Madison Ave	Co-op for sale	1275000	1	1.000000	2184.207862	427 W 21st St Unit Garden	New York, NY 10011	427 W 21st St Unit GardenNew York, NY 10011	United States	New York	New York County	New York	West 21st Street	427 W 21st St, New York, NY 10011, USA	40.745882	-74.003398
4799	Brokered by E Realty International Corp	Condo for sale	598125	2	1.000000	655.000000	91-23 Corona Ave Unit 4G	Elmhurst, NY 11373	91-23 Corona Ave Unit 4GElmhurst, NY 11373	New York	Queens County	Queens	Flushing	91-23	91-23 Corona Ave. #4b, Flushing, NY 11373, USA	40.742770	-73.872752
4800	Brokered by Nyc Realty Brokers Llc	Co-op for sale	349000	1	1.000000	750.000000	460 Neptune Ave Apt 14O	Brooklyn, NY 11224	460 Neptune Ave Apt 14OBrooklyn, NY 11224	New York	Kings County	Brooklyn	Coney Island	460	460 Neptune Ave #14a, Brooklyn, NY 11224, USA	40.579147	-73.970949

	PRICE	BEDS	BATH	PROPERTYSQFT	LATITUDE	LONGITUDE
count	4.801000e+03	4801.000000	4801.000000	4801.000000	4801.000000	4801.000000
mean	2.356940e+06	3.356801	2.373861	2184.207862	40.714227	-73.941601
std	3.135525e+07	2.602315	1.946962	2377.140894	0.087676	0.101082
min	2.494000e+03	1.000000	0.000000	230.000000	40.499546	-74.253033
25%	4.990000e+05	2.000000	1.000000	1200.000000	40.639375	-73.987143
50%	8.250000e+05	3.000000	2.000000	2184.207862	40.726749	-73.949189
75%	1.495000e+06	4.000000	3.000000	2184.207862	40.771923	-73.870638
max	2.147484e+09	50.000000	50.000000	65535.000000	40.912729	-73.702450

	BROKERTITLE	TYPE	PRICE	BEDS	BATH	PROPERTYSQFT	STATE	MAIN_ADDRESS	ADMINISTRATIVE_AREA_LEVEL_2	LOCALITY	SUBLOCALITY	STREET_NAME	LONG_NAME	FORMATTED_ADDRESS	LATITUDE	LONGITUDE
ADDRESS
2 E 55th St Unit 803	Brokered by Douglas Elliman -111 Fifth Ave	Condo for sale	315000	2	2.000000	1400.0	New York, NY 10022	2 E 55th St Unit 803New York, NY 10022	New York County	New York	Manhattan	East 55th Street	Regis Residence	Regis Residence, 2 E 55th St #803, New York, N...	40.761255	-73.974483
Central Park Tower Penthouse-217 W 57th New York St Unit Penthouse	Brokered by Serhant	Condo for sale	195000000	7	10.000000	17545.0	New York, NY 10019	Central Park Tower Penthouse-217 W 57th New Yo...	United States	New York	New York County	New York	West 57th Street	217 W 57th St, New York, NY 10019, USA	40.766393	-73.980991
620 Sinclair Ave	Brokered by Sowae Corp	House for sale	260000	4	2.000000	2015.0	Staten Island, NY 10312	620 Sinclair AveStaten Island, NY 10312	United States	New York	Richmond County	Staten Island	Sinclair Avenue	620 Sinclair Ave, Staten Island, NY 10312, USA	40.541805	-74.196109
2 E 55th St Unit 908W33	Brokered by COMPASS	Condo for sale	69000	3	1.000000	445.0	Manhattan, NY 10022	2 E 55th St Unit 908W33Manhattan, NY 10022	United States	New York	New York County	New York	East 55th Street	2 E 55th St, New York, NY 10022, USA	40.761398	-73.974613
5 E 64th St	Brokered by Sotheby's International Realty - E...	Townhouse for sale	55000000	7	2.373861	14175.0	New York, NY 10065	5 E 64th StNew York, NY 10065	United States	New York	New York County	New York	East 64th Street	5 E 64th St, New York, NY 10065, USA	40.767224	-73.969856

	broker	type	price	beds	BATH	sqft	address	state	main address	aal	locality	sublocality	street name	long name	formatted address	latitude	longitude
0	Brokered by Douglas Elliman -111 Fifth Ave	Condo for sale	315000	2	2.000000	1400.0	2 E 55th St Unit 803	New York, NY 10022	2 E 55th St Unit 803New York, NY 10022	New York County	New York	Manhattan	East 55th Street	Regis Residence	Regis Residence, 2 E 55th St #803, New York, N...	40.761255	-73.974483
1	Brokered by Serhant	Condo for sale	195000000	7	10.000000	17545.0	Central Park Tower Penthouse-217 W 57th New Yo...	New York, NY 10019	Central Park Tower Penthouse-217 W 57th New Yo...	United States	New York	New York County	New York	West 57th Street	217 W 57th St, New York, NY 10019, USA	40.766393	-73.980991
2	Brokered by Sowae Corp	House for sale	260000	4	2.000000	2015.0	620 Sinclair Ave	Staten Island, NY 10312	620 Sinclair AveStaten Island, NY 10312	United States	New York	Richmond County	Staten Island	Sinclair Avenue	620 Sinclair Ave, Staten Island, NY 10312, USA	40.541805	-74.196109
3	Brokered by COMPASS	Condo for sale	69000	3	1.000000	445.0	2 E 55th St Unit 908W33	Manhattan, NY 10022	2 E 55th St Unit 908W33Manhattan, NY 10022	United States	New York	New York County	New York	East 55th Street	2 E 55th St, New York, NY 10022, USA	40.761398	-73.974613
4	Brokered by Sotheby's International Realty - E...	Townhouse for sale	55000000	7	2.373861	14175.0	5 E 64th St	New York, NY 10065	5 E 64th StNew York, NY 10065	United States	New York	New York County	New York	East 64th Street	5 E 64th St, New York, NY 10065, USA	40.767224	-73.969856

	broker	type	price	beds	BATH	sqft	address	main address	locality	sublocality	latitude	longitude
0	Brokered by Douglas Elliman -111 Fifth Ave	Condo for sale	315000	2	2.000000	1400.0	2 E 55th St Unit 803	2 E 55th St Unit 803New York, NY 10022	New York	Manhattan	40.761255	-73.974483
1	Brokered by Serhant	Condo for sale	195000000	7	10.000000	17545.0	Central Park Tower Penthouse-217 W 57th New Yo...	Central Park Tower Penthouse-217 W 57th New Yo...	New York	New York County	40.766393	-73.980991
2	Brokered by Sowae Corp	House for sale	260000	4	2.000000	2015.0	620 Sinclair Ave	620 Sinclair AveStaten Island, NY 10312	New York	Richmond County	40.541805	-74.196109
3	Brokered by COMPASS	Condo for sale	69000	3	1.000000	445.0	2 E 55th St Unit 908W33	2 E 55th St Unit 908W33Manhattan, NY 10022	New York	New York County	40.761398	-73.974613
4	Brokered by Sotheby's International Realty - E...	Townhouse for sale	55000000	7	2.373861	14175.0	5 E 64th St	5 E 64th StNew York, NY 10065	New York	New York County	40.767224	-73.969856

New York Housing Data EDA and Regression¶

By: Justin Haysbert¶

Project Goals¶

Dataset¶

Work Plan¶

ETL¶

Load in the dataset¶

Lets look at the details¶

Lets check for any null values¶

Rename the columns because I don't like that they are in all caps¶

EDA¶

Takeaways and Questions¶

Machine Learning Section¶

Final Results¶

Final Thoughts¶