EDA: PLUTO, 311, DOF#
import geopandas as gpd
import pandas as pd
import plotly.express as px
import numpy as np
import requests
from urllib.request import urlopen
import json
import plotly
import matplotlib
import matplotlib.dates as mdates
from matplotlib.ticker import FuncFormatter
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
import seaborn as sns
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100
from nwConfig import socrataAppToken
Import data#
311 street flooding data#
# socrata app token
appToken = socrataAppToken
# create API request variables
headers = {'X-App-Token': appToken}
search_api_url = 'https://data.cityofnewyork.us/resource/erm2-nwe9.json'
params = {'descriptor': 'Street Flooding (SJ)', '$limit': 50000}
# call the API with the requests library
response = requests.get(search_api_url,
                        headers=headers,
                        params=params,
                        timeout=20)
# parse the JSON response into a list of records
data_dict = response.json()
raw311 = pd.DataFrame.from_dict(data_dict)
raw311.head()
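The cell above assumes a healthy response. A minimal guard before trusting the parsed records, reusing the response object from the request (a sketch, not part of the original pipeline):
# fail fast on a non-2xx status before working with the parsed records
response.raise_for_status()
print(f'Status: {response.status_code}, records returned: {len(data_dict)}')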
MapPLUTO data#
# # import MapPLUTO dataset - limit to 10 rows because it's taking too long to process
# mapPLUTO = gpd.read_file(r'D:\Documents\Jupyter_Notebooks\311\data\mapPLUTO\MapPLUTO.shp', rows=10)
PLUTO data#
# declare datatypes for columns
dtypes = {'borough': str, 'block': str, 'lot': str, 'community board': str, 'census tract 2010': str,
          'cb2010': str, 'schooldist': str, 'council district': str, 'postcode': str, 'address': str,
          'zonedist1': str,
          'bldgclass': str, 'landuse': str, 'ownertype': str, 'ownername': str,
          'bbl': str,
          'tract2010': str}
# read the PLUTO csv export
rawPluto = pd.read_csv(r"D:\Documents\Jupyter_Notebooks\311\data\Primary_Land_Use_Tax_Lot_Output__PLUTO_.csv", dtype=dtypes)
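PLUTO is a wide file, so full reads can be slow. If memory or load time is a concern, pandas can read just the columns of interest; a minimal sketch, assuming these column names match the file header (plutoSlim and the column subset are illustrative):
# optional: read only a subset of columns to cut memory and load time
subset_cols = ['borough', 'block', 'lot', 'bbl', 'assesstot', 'yearbuilt']
plutoSlim = pd.read_csv(r"D:\Documents\Jupyter_Notebooks\311\data\Primary_Land_Use_Tax_Lot_Output__PLUTO_.csv",
                        usecols=subset_cols,
                        dtype={'borough': str, 'block': str, 'lot': str, 'bbl': str})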
Dept. of Finance Property Assessment & Valuation data sample response#
https://data.cityofnewyork.us/City-Government/Property-Valuation-and-Assessment-Data/yjxr-fw8i
# create API request variables
headers = {'X-App-Token': appToken}
search_api_url = 'https://data.cityofnewyork.us/resource/yjxr-fw8i.json'
params = {'$limit': 10, '$select': 'BBLE, BORO, BLOCK, LOT, EASEMENT, BLDGCL, TAXCLASS, FULLVAL, STADDR, ZIP, YEAR'}
# call the API with the requests library
response = requests.get(search_api_url,
                        headers=headers,
                        params=params,
                        timeout=20)
# parse the JSON response into a list of records
data_dict = response.json()
# convert the list of records to a dataframe
data = pd.DataFrame.from_dict(data_dict)
# print each raw record
for i in range(len(data_dict)):
    print(data_dict[i])
data.head(10)
Dept. of Finance Property Assessment & Valuation tax class 1, 2, 3, 4 data from the API#
# socrata app token
appToken = socrataAppToken
# empty list to collect API request data
lst = []
# offset to increment requests through the full dataset
offset = 0
for i in range(70):
    try:
        # create API request variables
        headers = {'X-App-Token': appToken}
        search_api_url = 'https://data.cityofnewyork.us/resource/8y4t-faws.json'
        params = {'$limit': 100000, '$select': 'parid, boro, block, lot, BLDG_CLASS, PYMKTTOT, TENTAXCLASS, YEAR, HOUSENUM_LO, HOUSENUM_HI, STREET_NAME, ZIP_CODE', '$offset': offset}
        # call the API with the requests library
        response = requests.get(search_api_url,
                                headers=headers,
                                params=params,
                                timeout=20)
        # parse the JSON response into a list of records
        data_dict = response.json()
        # an empty page means we have read past the end of the dataset
        if not data_dict:
            print('Empty response - reached the end of the dataset')
            break
        # convert the list of records to a dataframe
        data = pd.DataFrame.from_dict(data_dict)
        # append dataframe to list
        lst.append(data)
        # add 100,000 to the offset
        offset += 100000
        print(f'Current offset number: {offset}')
    except Exception as ex:
        print(f'Exception: {ex}\nexit loop')
        break
fin = pd.concat(lst)
# zero-pad block to 5 digits and lot to 4, then build the 10-digit bbl
fin['block'] = fin['block'].str.zfill(5)
fin['lot'] = fin['lot'].str.zfill(4)
fin['bbl'] = fin['boro'] + fin['block'] + fin['lot']
# create csv file
fin.to_csv(r'D:\Documents\Jupyter_Notebooks\311\data\finData.csv')
fin.info()
fin.head()
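For reference, the paging pattern above can be wrapped into one reusable function. This is a sketch under the same assumptions (endpoint, token, 100,000-row pages), not the code used to build finData.csv; Socrata's docs recommend pairing $offset with $order so page boundaries stay stable between requests:
def fetch_socrata_pages(url, select, app_token, page_size=100000, max_pages=70):
    """Page through a Socrata endpoint and return one concatenated dataframe (sketch)."""
    frames = []
    for page in range(max_pages):
        params = {'$limit': page_size,
                  '$offset': page * page_size,
                  '$select': select,
                  '$order': ':id'}  # stable ordering keeps pages from shifting between calls
        resp = requests.get(url, headers={'X-App-Token': app_token}, params=params, timeout=20)
        resp.raise_for_status()
        records = resp.json()
        if not records:  # empty page - past the end of the dataset
            break
        frames.append(pd.DataFrame(records))
    return pd.concat(frames, ignore_index=True)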
Import and EDA - Property Valuation and Assessment tax class 1, 2, 3, 4 from the NYC Dept. of Finance#
# declare datatypes for columns
dtypes = {'boro': str, 'block': str, 'lot': str, 'parid': str, 'ZIP_CODE': str, 'bbl': str}
# read the saved csv of DOF data
finRaw = pd.read_csv(r'D:\Documents\Jupyter_Notebooks\311\data\finData.csv', dtype=dtypes)
finRaw.info()
finRaw.head()
finRaw.tail()
finRaw['YEAR'].unique()
finRaw.describe()
finRaw.isna().sum()
finRaw.loc[finRaw['PYMKTTOT'] < 100000,'PYMKTTOT'].hist()
# print number of unique parid and bbl in whole data set
print(f"There are {finRaw['parid'].nunique()} unique parid")
print(f"There are {finRaw['bbl'].nunique()} unique bbl ")
# print number of unique parid and bbl in one year
year = 2021
print(f"There are {finRaw.loc[finRaw['YEAR'] == year,'parid'].nunique()} unique parid in {year}")
print(f"There are {finRaw.loc[finRaw['YEAR'] == year, 'bbl'].nunique()} unique bbl in {year}")
# list of bbls from df
bbl_list = finRaw['bbl'].values.tolist()
bbl_list[0:10]
finRaw[finRaw['bbl'].isin(bbl_list[0:10])].sort_values(by='bbl').head()
# take out all condominium building classes except R0 - Special Condominium Billing Lot
# this filter approximately matches the PLUTO list of BBLs
fin_cond_filter = finRaw.loc[~finRaw['BLDG_CLASS'].isin(['RA', 'RB', 'RG', 'RH', 'RK', 'RP', 'RR', 'RS', 'RT', 'RW',
                                                         'R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'R8', 'R9'])]
# print number of unique parid and bbl in whole data set
print(f"There are {fin_cond_filter['parid'].nunique()} unique parid")
print(f"There are {fin_cond_filter['bbl'].nunique()} unique bbl ")
# print number of unique parid and bbl in one year
year = 2021
print(f"There are {fin_cond_filter.loc[finRaw['YEAR'] == year,'parid'].nunique()} unique parid in {year}")
print(f"There are {fin_cond_filter.loc[finRaw['YEAR'] == year, 'bbl'].nunique()} unique bbl in {year}")
# df filtered for 2023 market values
market_2023 = fin_cond_filter[fin_cond_filter['YEAR'] == 2023]
# create dataframe of market value grouped by bbl
market_bbl_2023 = market_2023[['bbl', 'PYMKTTOT']].groupby('bbl').sum()
market_bbl_2023.info()
market_bbl_2023.head()
market_bbl_2023.describe()
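These grouped values are the piece that could later be joined onto lot-level data. A sketch of that join, left commented out because the PLUTO frame is only built in the next section; the names are assumptions:
# pluto_with_value = PLUTO.merge(market_bbl_2023.reset_index(), on='bbl', how='left')
# pluto_with_value['PYMKTTOT'].isna().sum()  # lots without a 2023 DOF match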
PLUTO EDA#
rawPluto.columns
# identify pluto columns to keep from the data dictionary description
cols_to_keep = ['borough', 'block', 'lot', 'community board', 'census tract 2010',
                'cb2010', 'schooldist', 'council district', 'postcode', 'address',
                'zonedist1',
                'bldgclass', 'landuse', 'ownertype', 'ownername',
                'lotarea',
                'unitsres', 'unitstotal',
                'assessland', 'assesstot',
                'yearbuilt', 'bbl',
                'tract2010', 'xcoord', 'ycoord', 'latitude', 'longitude',
                'firm07_flag', 'pfirm15_flag']
# copy() avoids SettingWithCopyWarning when columns are modified later
PLUTO = rawPluto[cols_to_keep].copy()
PLUTO.head()
PLUTO.info()
# look at values in flood map column
PLUTO['pfirm15_flag'].value_counts()
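The same counts as proportions give a quicker read on how much of the city sits in the 2015 preliminary FIRM zone:
# share of lots flagged as inside the 2015 preliminary flood insurance rate map
PLUTO['pfirm15_flag'].value_counts(normalize=True, dropna=False)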
PLUTO.hist(figsize=(15,15))
PLUTO.describe()
Take a closer look at columns with outliers#
# print a boxplot of outliers for each numeric column
for column in PLUTO.columns:
    if pd.api.types.is_numeric_dtype(PLUTO[column]):
        mean = PLUTO[column].mean()
        std = PLUTO[column].std()
        new_df = PLUTO[(PLUTO[column] < (mean - 3 * std)) | (PLUTO[column] > (mean + 3 * std))]
        print("===========================================")
        print(f"Column: {column}")
        print(f"Mean: {mean}")
        print(f"Lower Bound: {mean - 3 * std}, Upper bound: {mean + 3 * std}")
        print(f"Outlier count: {new_df.shape[0]}")
        plt.boxplot(new_df[column].dropna())
        plt.show()
        print("===========================================")
yearbuilt#
PLUTO['yearbuilt'].hist()
# mask implausible yearbuilt values (pre-1750 placeholders become NaN)
PLUTO['yearbuilt'] = PLUTO['yearbuilt'].where(PLUTO['yearbuilt'] > 1750)
assesstot#
PLUTO['assesstot'].hist()
PLUTO.loc[PLUTO['assesstot'] > 1000000000]
PLUTO.loc[PLUTO['assesstot'] < 1000000, :].info()
unitstotal#
PLUTO['unitstotal'].hist()
Map PLUTO flood bbls#
# filter PLUTO to get flood dataframe for pfirm 2015 flood map
flood_bbls = PLUTO[PLUTO['pfirm15_flag'] == 1]
flood_bbls.info()
flood_bbls[['xcoord', 'ycoord']].info()
# create geodataframe from the state plane coordinates
PLUTO_flood_gdf = gpd.GeoDataFrame(flood_bbls,
                                   geometry=gpd.points_from_xy(flood_bbls['xcoord'],
                                                               flood_bbls['ycoord']))
# set crs to NY State Plane (EPSG:2263)
PLUTO_flood_gdf = PLUTO_flood_gdf.set_crs(epsg=2263)
PLUTO_flood_gdf.crs
PLUTO_flood_gdf.head()
PLUTO_flood_gdf.plot()
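Note that set_crs only declares the coordinate system; to_crs actually reprojects. A sketch of reprojecting to WGS84 for web basemaps (the plotly map below sidesteps this by using the existing latitude/longitude columns):
# reproject from NY State Plane (feet) to WGS84 lat/lon
PLUTO_flood_wgs84 = PLUTO_flood_gdf.to_crs(epsg=4326)
PLUTO_flood_wgs84.geometry.head()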
# plotly express map
fig = px.scatter_mapbox(flood_bbls,
                        lat='latitude',
                        lon='longitude',
                        hover_name='address',  # 'flood ' is not a column; address labels each point
                        hover_data=['bbl'],
                        zoom=9)
fig.update_layout(mapbox_style="carto-positron")
fig.update_layout(margin={"r":0,"t":30,"l":0,"b":0})
fig.show()
311 EDA#
# previewing first five rows in data
print('shape of data: {}'.format(raw311.shape))
raw311.head()
# printing True/False if our unique key column (unique_key) has no duplicates
raw311['unique_key'].is_unique
raw311['descriptor'].value_counts()
print('Sum of nulls in column:')
raw311.isnull().sum().sort_values(ascending=False)
raw311[['x_coordinate_state_plane', 'y_coordinate_state_plane']].info()
print('Dropping nulls in the x, y coordinate columns and creating a geodataframe.')
to_points = raw311.loc[raw311['x_coordinate_state_plane'].notnull()]
gdf = gpd.GeoDataFrame(to_points,
                       geometry=gpd.points_from_xy(to_points['x_coordinate_state_plane'],
                                                   to_points['y_coordinate_state_plane']))
gdf.head()
print('Our new dataframe contains {:,} rows.'.format(len(gdf)))
total = (len(raw311) - len(gdf))
percent = round((1 - len(gdf)/len(raw311)) * 100, 2)
print("We've dropped {} rows or {}% of our data for mapping purposes.".format(total, percent))
gdf[['geometry']].info()
print('Nulls in our x, y coordinate columns:')
gdf[['x_coordinate_state_plane', 'y_coordinate_state_plane', 'geometry']].isnull().sum()
gdf['created_date'] = pd.to_datetime(gdf['created_date'])
gdf['year'] = gdf['created_date'].dt.year
gdf[['created_date']].info()
print('Summary statistics on our date column.')
# note: datetime_is_numeric was removed in pandas 2.0; describe() now summarizes datetimes numerically by default
gdf[['created_date', 'year']].describe()
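With the year column in place, a quick look at complaint volume over time (a simple sketch):
# complaints per year
gdf['year'].value_counts().sort_index().plot(kind='bar',
                                             figsize=(10, 4),
                                             title='311 street flooding complaints by year')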
print(type(gdf))
gdf = gdf.set_crs(epsg=2263)
gdf.crs
# read in nta shapefile
url = 'https://data.cityofnewyork.us/api/geospatial/cpf4-rkhq?method=export&format=GeoJSON'
nta_shape = gpd.read_file(url)
print('shape of data: {}'.format(len(nta_shape)))
nta_shape.head()
nta_shape = nta_shape.to_crs(epsg=2263)
nta_shape.crs
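With both layers in EPSG:2263, each complaint can be attached to the neighborhood that contains it via a spatial join. A minimal sketch with geopandas' sjoin (requires geopandas >= 0.10 for the predicate argument; ntaname is assumed to be the neighborhood name column in this file):
# join complaint points to containing NTA polygons and count per neighborhood
joined = gpd.sjoin(gdf, nta_shape, how='inner', predicate='within')
joined.groupby('ntaname').size().sort_values(ascending=False).head(10)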
fig, ax = plt.subplots(figsize=(8, 8))
# earliest and latest complaint dates in the data
latest, earliest = gdf['created_date'].max(), gdf['created_date'].min()
hb = ax.hexbin(gdf['x_coordinate_state_plane'],
               gdf['y_coordinate_state_plane'],
               gridsize=30,
               cmap='Blues')
divider = make_axes_locatable(ax)
cax = divider.append_axes("right", size="5%", pad=0.1)
plt.colorbar(hb, cax=cax)
nta_shape.plot(facecolor="none",
               edgecolor="black",
               linewidth=.2,
               ax=ax)
ax.set_title("Count of NYC 311 Street Flooding Complaints from 2010 to 2023", fontsize=13)
ax.axis('off')
plt.tight_layout()
print(f'earliest: {earliest}, latest: {latest}')