Analysis of 311 Street Flooding Complaints and NYC DOF Property Assessment & Valuation Market Values#

import geopandas as gpd
import pandas as pd
import plotly.express as px
import numpy as np
import requests
from urllib.request import urlopen
from scipy.stats import norm
import json
import plotly
import matplotlib.ticker as ticker
import matplotlib.dates as mdates
from matplotlib.ticker import FuncFormatter
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
import seaborn as sns
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100
from nwConfig import socrataAppToken

Get data#

311 flood complaints data#

# socrata app token
appToken = socrataAppToken

# create API request variables
headers = {'X-App-Token': appToken}
search_api_url = 'https://data.cityofnewyork.us/resource/erm2-nwe9.json'
params = {'descriptor': 'Street Flooding (SJ)', '$limit': 50000}

# call API with pandas requests method
response = requests.get(search_api_url, 
                        headers=headers, 
                        params=params, 
                        timeout=20)

# return a dictionary
data_dict = response.json()

# create dataframe from dictionary
raw311 = pd.DataFrame.from_dict(data_dict)

raw311.head()
# created date column parse to datatime
raw311['created_date'] = pd.to_datetime(raw311.loc[:,'created_date'])
raw311.info()

PLUTO data#

# declare datatypes for columns
dtypes = {'borough': str, 'block': str, 'lot': str, 'community board': str, 'census tract 2010': str,
       'cb2010': str, 'schooldist': str, 'council district': str, 'postcode': str, 'address': str,
       'zonedist1': str,
       'bldgclass': str, 'landuse': str, 'ownertype': str, 'ownername': str,
       'bbl': str,
       'tract2010': str}

# read text file
rawPluto = pd.read_csv(r"C:\Users\markf\Documents\Jupyter_Notebooks\311\data\Primary_Land_Use_Tax_Lot_Output__PLUTO_.csv", dtype=dtypes)
# identify pluto columns to keep from data dictionary description
cols_to_keep = ['borough', 'block', 'lot', 'community board', 'census tract 2010',
       'cb2010', 'schooldist', 'council district', 'postcode', 'address',
       'zonedist1',
       'bldgclass', 'landuse', 'ownertype', 'ownername',
       'lotarea',
       'unitsres', 'unitstotal',
       'assessland', 'assesstot',
       'yearbuilt', 'bbl',
       'tract2010', 'xcoord', 'ycoord', 'latitude', 'longitude',
       'firm07_flag', 'pfirm15_flag']

PLUTO =rawPluto[cols_to_keep]
# remove outliers from yearbuilt
PLUTO['yearbuilt'] = PLUTO.loc[PLUTO['yearbuilt'] > 1750, 'yearbuilt']

Add counts of 311 flood complaints to PLUTO BBLS#

# create new raw311 df aggregated by complaints per bbl
flood_complaints_by_bbl = raw311[['bbl', 'unique_key']].groupby('bbl').count().reset_index().rename(columns={'unique_key': 'flood complaints'})
flood_complaints_by_bbl.info()
flood_complaints_by_bbl.head()
# merge PLUTO dataframe to 311 dataframe

PLUTO_311 = flood_complaints_by_bbl.merge(PLUTO, 
       how = 'right', left_on= 'bbl', right_on='bbl')
PLUTO_311.info()
PLUTO_311.head()

Merge NYC DOF Property Assessment & Valuation market values to PLUTO by bbl#

# import property valuation data

# declare datatypes for columns
dtypes = {'boro': str, 'block': str, 'lot': str, 'parid': str, 'ZIP_CODE': str, 'bbl': str}

# read text file
finRaw = pd.read_csv(r'C:\Users\markf\Documents\Jupyter_Notebooks\311\data\finData.csv', dtype=dtypes)
# take out all condominium building classes except R0 - Special Condominium Billing Lot
# this filter approximately matches the PLUTO list of BBLS
fin_cond_filter = finRaw.loc[~finRaw['BLDG_CLASS'].isin(['RA', 'RB', 'RG', 'RH', 'RK', 'RP', 'RR', 'RS', 'RT', 'RW',
                                                         'R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'R8', 'R9', 'RR'])]
# df filtered for 2023 market values (Note: only one valuation for year 2023, but care must be taken as other years have two rows for each year)
market_2023 = fin_cond_filter[fin_cond_filter['YEAR'] == 2023]
# create dataframe of market value grouped by bbl
market_bbl_2023 = market_2023[['bbl', 'PYMKTTOT']].groupby('bbl').sum()
market_bbl_2023.describe()
# merge pluto_311 df with DOF property market values by BBL
pluto_311_dof = PLUTO_311.merge(market_bbl_2023, how='left', left_on='bbl', right_on='bbl')
pluto_311_dof.info()
pluto_311_dof.describe()
# histogram of BBL market values
pluto_311_dof.loc[(pluto_311_dof['PYMKTTOT'] > 100000) & (pluto_311_dof['PYMKTTOT'] < 10000000),'PYMKTTOT'].hist()
print(f"% of BBLS greater than $100K: {len(pluto_311_dof.loc[(pluto_311_dof['PYMKTTOT'] > 100000),'PYMKTTOT'])/len(pluto_311_dof)}")
pluto_311_dof.head()
pluto_311_dof.dtypes

Choropleth plot of 311 complaints per zip code#

# use GeoJSON file from NYC opendata which contains GIS data for zip code boundaries in NYC
# web page with info on data set url here: 
# https://data.cityofnewyork.us/Health/Modified-Zip-Code-Tabulation-Areas-MODZCTA-/pri4-ifjk
zip_codes = gpd.read_file('https://data.cityofnewyork.us/resource/pri4-ifjk.geojson')
zip_codes.rename(columns={'geometry': 'zip geometry'}, inplace=True)
zip_codes.head()
# merge zip code geometries 
PLUTO_311_dof_zip = zip_codes[['modzcta', 'zip geometry']].merge(pluto_311_dof, how = 'right', left_on = 'modzcta', right_on = 'postcode')
PLUTO_311_dof_zip.rename(columns={'PYMKTTOT': 'Valuation'}, inplace=True)
PLUTO_311_dof_zip.info()
# use GeoJSON file from NYC opendata which contains GIS data for zip code boundaries in NYC
# web page with info on data set url here: 
# https://data.cityofnewyork.us/Health/Modified-Zip-Code-Tabulation-Areas-MODZCTA-/pri4-ifjk

boro = 'BK'

with urlopen('https://data.cityofnewyork.us/resource/pri4-ifjk.geojson') as response:
    zip_codes = json.load(response)
# date of latest and earliest complaint
latest, earliest = raw311['created_date'].max().strftime("%m/%d/%Y"), raw311['created_date'].min().strftime("%m/%d/%Y")
# create df of flood complaints aggregated by zip code
data = PLUTO_311_dof_zip[['postcode', 'flood complaints']].loc[(PLUTO_311_dof_zip['borough'] == 'BK'),:].groupby('postcode').sum().reset_index()
# today's date
today = pd.to_datetime("today").strftime("%m/%d/%Y")

fig = px.choropleth_mapbox(data, geojson=zip_codes, locations='postcode', color='flood complaints',
                           featureidkey='properties.modzcta',
                           color_continuous_scale="blues",
                           range_color=(0, np.max(data['flood complaints'])),
                           mapbox_style="carto-positron",
                           zoom=9.25, center = {"lat": 40.743, "lon": -73.988},
                           opacity=0.5,
                           labels={},
                           title=f"NYC Flood Complaints by ZIP Code in {boro}: {earliest} - {latest}",
                           width=600
                           #height=600
                          ).update(layout=dict(title=dict(x=0.5)))
fig.update_layout(margin={"r":0,"t":30,"l":0,"b":0})
fig.show()

Choropleth plot of avg assessed total value bbl by zip code#

# use GeoJSON file from NYC opendata which contains GIS data for zip code boundaries in NYC
# web page with info on data set url here: 
# https://data.cityofnewyork.us/Health/Modified-Zip-Code-Tabulation-Areas-MODZCTA-/pri4-ifjk

boro = 'BK'

with urlopen('https://data.cityofnewyork.us/resource/pri4-ifjk.geojson') as response:
    zip_codes = json.load(response)

max_zip_dict = {'BK': '11201'}
# create df of flood complaints aggregated by zip code
data = PLUTO_311_dof_zip.loc[(PLUTO_311_dof_zip['borough'] == 'BK') 
                                & (PLUTO_311_dof_zip['Valuation'] > 0),['postcode', 'Valuation']].groupby('postcode').mean().reset_index()
# today's date


fig = px.choropleth_mapbox(data, geojson=zip_codes, locations='postcode', color='Valuation',
                           featureidkey='properties.modzcta',
                           color_continuous_scale="reds",
                           range_color=(0, np.max(data.loc[data['postcode'] == '11201', 'Valuation'])),
                           mapbox_style="carto-positron",
                           zoom=9.25, center = {"lat": 40.743, "lon": -73.988},
                           opacity=0.5,
                           labels={'Valuation': 'Valuation ($)'},
                           title=f"NYC DOF Avg BBL Valuation by ZIP Code in {boro} 2023",
                           width=600
                          ).update(layout=dict(title=dict(x=0.5)))
fig.update_layout(margin={"r":0,"t":30,"l":0,"b":0})
fig.show()
lst = []

for boro in ['BK', 'MN', 'QN', 'BX', 'SI']:

    # correlation
    vals = PLUTO_311_dof_zip.loc[(PLUTO_311_dof_zip['borough'] == boro) 
                & (PLUTO_311_dof_zip['Valuation'] > 0),['postcode', 'Valuation']].groupby('postcode').mean().reset_index()
    floods = PLUTO_311_dof_zip[['postcode', 'flood complaints']].loc[(PLUTO_311_dof_zip['borough'] == boro),:].groupby('postcode').sum().reset_index()

    # evaluate correlations 
    cor = vals.merge(floods, how = 'inner', left_on='postcode', right_on='postcode')
    spearman = cor.corr(method = 'spearman')
    pearson = cor.corr(method = 'pearson')
    kendall = cor.corr(method = 'kendall')

    # create df of correlations
    d = {'Spearman': [spearman.iloc[0,1]], 'Pearson': [pearson.iloc[0,1]], 'Kendall': [kendall.iloc[0,1]]}
    df = pd.DataFrame(d, index = [boro])
    lst.append(df)

correlations = pd.concat([x for x in lst])
sns.heatmap(correlations, cmap="YlGnBu", annot=True)
correlations = pd.concat([x for x in lst])
correlations.to_csv('correlations.csv')
# look at top Valuation in specific borough and zip
data = PLUTO_311_dof_zip.loc[PLUTO_311_dof_zip['borough'] == 'BK',['postcode', 'Valuation']].groupby('postcode').mean().reset_index()
np.max(data.loc[data['postcode'] == '11201', 'Valuation'])
# loook at top 10 avg Valuations in a particular zip code
PLUTO_311_dof_zip.loc[PLUTO_311_dof_zip['borough'] == 'BK',['postcode', 'Valuation']].groupby('postcode').mean().reset_index().sort_values(by = 'Valuation').tail(10)

Scatter Map comparing PLUTO assesstot for flood complaint versus non flood complaint bbls#

PLUTO_311_dof_zip.info()
# add flood comlaint flag column
PLUTO_311_dof_zip['flood complaint flag'] = PLUTO_311_dof_zip.loc[:,'flood complaints'].apply(lambda x: 1 if x>0 else 0)
# plotly express map to show BBLs and valuation in scatter plot for specific post code

#remove na values from assesstot column
PLUTO_311_scatter = PLUTO_311_dof_zip[~PLUTO_311_dof_zip['Valuation'].isnull()]
# remove 0 values from assesstot column
PLUTO_311_scatter = PLUTO_311_scatter[(PLUTO_311_scatter['Valuation'] > 0) & (PLUTO_311_scatter['Valuation'] < 1000000)]
# create log(assesstot) column for plot as there are large outliers
#PLUTO_311_scatter['log(assesstot)'] = PLUTO_311_scatter.loc[:, 'assesstot'].apply(lambda x: np.log(x))
# remove assesstot > 100,000,000


fig = px.scatter_mapbox(PLUTO_311_scatter.loc[PLUTO_311_scatter['postcode'] == '11234', :],
                    lat='latitude',
                    lon='longitude',
                    color='flood complaint flag',
                    size='Valuation',
                    zoom=9,
                    width=800,
                    color_discrete_sequence = ['blue','yellow' ]
                    )
fig.update_layout(mapbox_style="carto-positron")
fig.update_layout(margin={"r":0,"t":30,"l":0,"b":0})
fig.show()
PLUTO_311_scatter.head()
# histogram of Valuations

data = PLUTO_311_dof_zip[(PLUTO_311_dof_zip['Valuation']>100000) & (PLUTO_311_dof_zip['Valuation'] < 100000000)]

print(f"{len(PLUTO_311_dof_zip[(PLUTO_311_dof_zip['Valuation']>100000) & (PLUTO_311_dof_zip['Valuation'] < 100000000)])/len(PLUTO_311_dof_zip)*100}% of NYC BBLs are between $100K and 100M")
print(f"Mean: {np.mean(data['Valuation'])}")
print(f"Std: {np.std(data['Valuation'])}")

plot = px.histogram(data, x="Valuation", marginal="box", 
                    opacity = 0.5, histnorm="probability density", title="Histogram of NYC DOF Valuations between $100K and $100M", 
                    labels= {"Valuation": "Valuation ($)"})
plot.show()
# histogram of valuations with normal curve

hist_vals = PLUTO_311_dof_zip.loc[(PLUTO_311_dof_zip['Valuation'] > 1000) & (PLUTO_311_dof_zip['Valuation'] < 10000000), 'Valuation']
num_bins = 100
mu = np.mean(hist_vals)
sigma = np.std(hist_vals)
plt.hist(hist_vals,
         density=True,
         bins = num_bins,
         alpha = 0.7,
         #range = (0,200000)
         )

# Plot the PDF.
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = norm.pdf(x, mu, sigma)

plt.xlabel('Valuation')

plt.plot(x, p, 'k', linewidth=2)
title = "NYC Property Valuations by BBL between \$1,000 and \$10,000,000"
plt.title(title)
current_values = plt.gca().get_xticks()
plt.gca().set_xticklabels(['\${:,.0f}'.format(x) + 'M' for x in current_values/1000000])
plt.show()