Street Flooding & MapPLUTO#
Merge the NYC Street Flooding Service Requests dataset with MapPLUTO
Import Libraries#
Standard Libraries#
import os
import json
Custom Libraries#
from utilities import (
check_file_exists as cfe
)
External Libraries#
import geopandas as gpd
import pandas as pd
import fiona
from shapely import wkt
Define Variables#
Input Files#
data_stats_json_input = 'data/data-stats.json'
Output Files#
nyc_street_flooding_gdf_columns_output = 'data/nyc_street_flooding_gdf_columns.txt'
pluto_gdf_columns_output = 'data/pluto_gdf_columns.txt'
current_script_path = os.path.realpath(os.path.curdir)
merge_flood_mappluto_output_ofgdb = 'data/merge/nyc-street-flooding-mappluto.gdb'
merge_flood_mappluto_output_geojson = 'data/merge/nyc-street-flooding-mappluto.geojson'
# pluto_geojson = 'data/PLUTO/pluto_22v3_1.geojson'
# pluto_geojson_full_path = f'{current_script_path}/{pluto_geojson}'
NYC Street Flooding Complaints#
nyc_street_flooding_geojson = 'data/street-flooding/street-flood-complaints_rows-all.geojson'
MapPLUTO#
map_pluto_gdb_folder = 'data/PLUTO/MapPLUTO_22v3_1_water_included.gdb'
%%script echo "[skip] Reason: using gdb file type"
map_pluto_shp_folder = 'data/PLUTO/MapPLUTO_22v3_1_water_included.shp'
[skip] Reason: using gdb file type
Combine Street Flooding Complaints
and MapPLUTO
Datasets#
Read Street Flooding Complaints
#
street_flooding_gdf = gpd.read_file(
nyc_street_flooding_geojson,
driver = 'GeoJSON',
# rows = 100
)
street_flooding_gdf.crs
<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- undefined
Datum: World Geodetic System 1984
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich
sfc_size = len(street_flooding_gdf)
print(f'# of Street Flooding Complaints: {sfc_size}')
# of Street Flooding Complaints: 35156
Save Street Flooding Complaints
Column Names to nyc_street_flooding_gdf_columns.txt
#
street_flooding_columns = street_flooding_gdf.columns.tolist()
with open(f'{current_script_path}/{nyc_street_flooding_gdf_columns_output}', 'w') as write_f:
for column in street_flooding_columns:
write_f.write(f'{column}\n')
Check Index#
street_flooding_gdf.index
RangeIndex(start=0, stop=35156, step=1)
Set unique_key
as Index#
# test before deciding | street_flooding_gdf.set_index('unique_key', inplace = True)
Verify Index Change#
street_flooding_gdf.index
RangeIndex(start=0, stop=35156, step=1)
Read MapPLUTO
#
Time to read gdb folder: ~37m
print(map_pluto_gdb_folder)
data/PLUTO/MapPLUTO_22v3_1_water_included.gdb
%%script echo "[skip] using gdb file"
map_pluto_shp_folder = 'data/PLUTO/MapPLUTO_22v3_1_water_included.shp'
[skip] using gdb file
pluto_gdf = gpd.read_file(
map_pluto_gdb_folder,
file = 'FileGDB',
engine = 'fiona',
crs = 'EPSG2263',
# rows = 100000
)
Save PLUTO
Column Names to pluto_gdf_columns.txt
#
pluto_columns = pluto_gdf.columns.tolist()
with open(f'{current_script_path}/{pluto_gdf_columns_output}', 'w') as write_f:
for column in pluto_columns:
write_f.write(f'{column}\n')
Check EPSG CRS codes before merging, to ensure proper alignment#
EPSG (European Petroleum Survey Group) Geodetic Parameter Dataset is a respository of coordinate reference systems and transformations datasets, managed by the International Association of Oil and Gas Producers (IOGP). It is widely used in Geographic Information Systems (GIS) to define and describe the spatial reference systems of geographic data. [user Intgr et al., 2019]
CRS (Coordinate Reference System) is used to define and describe the spatial reference systems of geographic data. The CRS defines the origin, scale, and orientation of the coordinate system, as well as how the coordinates are mapped onto a flat surface such as a map or a computer screen. [Cain, 2013]
EPSG: 4326
street_flooding_gdf.crs
<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- undefined
Datum: World Geodetic System 1984
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich
pluto_gdf.crs
<Derived Projected CRS: EPSG:2263>
Name: NAD83 / New York Long Island (ftUS)
Axis Info [cartesian]:
- [east]: Easting (US survey foot)
- [north]: Northing (US survey foot)
Area of Use:
- undefined
Coordinate Operation:
- name: unnamed
- method: Lambert Conic Conformal (2SP)
Datum: North American Datum 1983
- Ellipsoid: GRS 1980
- Prime Meridian: Greenwich
Convert Pluto to EPSG code: 4326#
pluto_gdf.geometry
0 MULTIPOLYGON (((979834.644 191977.863, 980575....
1 MULTIPOLYGON (((973839.736 193894.244, 972500....
2 MULTIPOLYGON (((979286.676 198786.617, 977468....
3 MULTIPOLYGON (((979340.404 199175.516, 977511....
4 MULTIPOLYGON (((981288.593 194289.382, 981164....
...
857209 MULTIPOLYGON (((914466.480 124580.058, 914512....
857210 MULTIPOLYGON (((914321.022 124460.350, 914256....
857211 MULTIPOLYGON (((914318.219 124604.403, 914338....
857212 MULTIPOLYGON (((914740.567 124877.857, 914639....
857213 MULTIPOLYGON (((914842.706 124746.005, 914775....
Name: geometry, Length: 857214, dtype: geometry
pluto_4326_gdf = pluto_gdf.copy()
pluto_4326_gdf['geometry_multipolygon_pluto_x'] = pluto_4326_gdf.apply(lambda x: str(x))
pluto_4326_gdf.geometry
0 MULTIPOLYGON (((979834.644 191977.863, 980575....
1 MULTIPOLYGON (((973839.736 193894.244, 972500....
2 MULTIPOLYGON (((979286.676 198786.617, 977468....
3 MULTIPOLYGON (((979340.404 199175.516, 977511....
4 MULTIPOLYGON (((981288.593 194289.382, 981164....
...
857209 MULTIPOLYGON (((914466.480 124580.058, 914512....
857210 MULTIPOLYGON (((914321.022 124460.350, 914256....
857211 MULTIPOLYGON (((914318.219 124604.403, 914338....
857212 MULTIPOLYGON (((914740.567 124877.857, 914639....
857213 MULTIPOLYGON (((914842.706 124746.005, 914775....
Name: geometry, Length: 857214, dtype: geometry
pluto_4326_gdf['geometry_multipolygon_pluto_x']
0 NaN
1 NaN
2 NaN
3 NaN
4 NaN
...
857209 NaN
857210 NaN
857211 NaN
857212 NaN
857213 NaN
Name: geometry_multipolygon_pluto_x, Length: 857214, dtype: object
pluto_4326_gdf.geometry
0 MULTIPOLYGON (((979834.644 191977.863, 980575....
1 MULTIPOLYGON (((973839.736 193894.244, 972500....
2 MULTIPOLYGON (((979286.676 198786.617, 977468....
3 MULTIPOLYGON (((979340.404 199175.516, 977511....
4 MULTIPOLYGON (((981288.593 194289.382, 981164....
...
857209 MULTIPOLYGON (((914466.480 124580.058, 914512....
857210 MULTIPOLYGON (((914321.022 124460.350, 914256....
857211 MULTIPOLYGON (((914318.219 124604.403, 914338....
857212 MULTIPOLYGON (((914740.567 124877.857, 914639....
857213 MULTIPOLYGON (((914842.706 124746.005, 914775....
Name: geometry, Length: 857214, dtype: geometry
Verify Conversion#
pluto_4326_gdf.crs
<Derived Projected CRS: EPSG:2263>
Name: NAD83 / New York Long Island (ftUS)
Axis Info [cartesian]:
- [east]: Easting (US survey foot)
- [north]: Northing (US survey foot)
Area of Use:
- undefined
Coordinate Operation:
- name: unnamed
- method: Lambert Conic Conformal (2SP)
Datum: North American Datum 1983
- Ellipsoid: GRS 1980
- Prime Meridian: Greenwich
Join #1: BBL#
Check data type of join column#
street_flooding_gdf['bbl'].dtype
dtype('O')
pluto_4326_gdf['BBL'].dtype
dtype('float64')
Convert Street Flooding
GeoDataFrame’s bbl
column from object
to float
.#
street_flooding_gdf['BBL'] = street_flooding_gdf['bbl'].astype(float)
Left Join on BBL
#
Create column to store index before merge#
# street_flooding_gdf['unique_key'] = street_flooding_gdf.index
Preserve multipolygon geometry for the bbl merge#
street_flooding_gdf['geometry_point_street_x'] = street_flooding_gdf.geometry.astype(str)
col_list = pluto_4326_gdf.columns.tolist()
for col in col_list:
if col[:8] == 'geometry':
print(col)
geometry
geometry_multipolygon_pluto_x
pluto_4326_bbl_gdf = pluto_4326_gdf.drop(columns = 'geometry')
street_flooding_pluto_gdf = street_flooding_gdf.merge(
pluto_4326_bbl_gdf,
on = 'BBL',
how = 'left'
)
Preview Results of BBL
Join#
Check index#
street_flooding_pluto_gdf.index
Int64Index([ 0, 1, 2, 3, 4, 5, 6, 7, 8,
9,
...
35146, 35147, 35148, 35149, 35150, 35151, 35152, 35153, 35154,
35155],
dtype='int64', length=35156)
Set unique_key
as index#
# street_flooding_pluto_gdf.set_index('unique_key', inplace = True)
Verify index change#
# street_flooding_gdf.index
List attributes in a 5 column grid#
all_columns = street_flooding_pluto_gdf.columns.tolist()
chunks = [all_columns[x:x + 5] for x in range(0, len(all_columns), 5)]
attribute_grid = pd.DataFrame(chunks)
attribute_grid
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
0 | location_state | facility_type | intersection_street_2 | city | location_zip |
1 | park_borough | latitude | road_ramp | created_date | agency |
2 | park_facility_name | location_address | agency_name | descriptor | bbl |
3 | location_city | open_data_channel_type | cross_street_2 | bridge_highway_direction | longitude |
4 | bridge_highway_segment | street_name | incident_address | address_type | incident_zip |
5 | unique_key | complaint_type | y_coordinate_state_plane | status | bridge_highway_name |
6 | location_type | due_date | taxi_company_borough | taxi_pick_up_location | x_coordinate_state_plane |
7 | resolution_description | community_board | resolution_action_updated_date | intersection_street_1 | closed_date |
8 | vehicle_type | cross_street_1 | borough | landmark | geometry |
9 | BBL | geometry_point_street_x | Borough | Block | Lot |
10 | CD | BCT2020 | BCTCB2020 | CT2010 | CB2010 |
11 | SchoolDist | Council | ZipCode | FireComp | PolicePrct |
12 | HealthCenterDistrict | HealthArea | Sanitboro | SanitDistrict | SanitSub |
13 | Address | ZoneDist1 | ZoneDist2 | ZoneDist3 | ZoneDist4 |
14 | Overlay1 | Overlay2 | SPDist1 | SPDist2 | SPDist3 |
15 | LtdHeight | SplitZone | BldgClass | LandUse | Easements |
16 | OwnerType | OwnerName | LotArea | BldgArea | ComArea |
17 | ResArea | OfficeArea | RetailArea | GarageArea | StrgeArea |
18 | FactryArea | OtherArea | AreaSource | NumBldgs | NumFloors |
19 | UnitsRes | UnitsTotal | LotFront | LotDepth | BldgFront |
20 | BldgDepth | Ext | ProxCode | IrrLotCode | LotType |
21 | BsmtCode | AssessLand | AssessTot | ExemptTot | YearBuilt |
22 | YearAlter1 | YearAlter2 | HistDist | Landmark | BuiltFAR |
23 | ResidFAR | CommFAR | FacilFAR | BoroCode | CondoNo |
24 | Tract2010 | XCoord | YCoord | ZoneMap | ZMCode |
25 | Sanborn | TaxMap | EDesigNum | APPBBL | APPDate |
26 | PLUTOMapID | FIRM07_FLAG | PFIRM15_FLAG | Version | DCPEdited |
27 | Latitude | Longitude | Notes | Shape_Leng | Shape_Area |
28 | geometry_multipolygon_pluto_x | None | None | None | None |
preview_columns = [
'geometry_multipolygon_pluto_x',
'created_date',
'borough',
'Borough',
'street_name',
'bbl',
'ZipCode',
# 'geometry_x',
# 'geometry_y'
]
View oldest service requests#
street_flooding_pluto_gdf[preview_columns].head(5)
geometry_multipolygon_pluto_x | created_date | borough | Borough | street_name | bbl | ZipCode | |
---|---|---|---|---|---|---|---|
0 | NaN | 2010-01-02 08:26:00 | BROOKLYN | BK | DARE COURT | 3089000064 | 11229.0 |
1 | NaN | 2010-01-02 12:00:00 | STATEN ISLAND | NaN | NaN | NaN | NaN |
2 | NaN | 2010-01-02 17:45:00 | QUEENS | QN | 146 STREET | 4120050012 | 11436.0 |
3 | NaN | 2010-01-04 16:47:00 | QUEENS | QN | 218 STREET | 4106210008 | 11428.0 |
4 | NaN | 2010-01-05 10:37:00 | BROOKLYN | BK | EAST 63 STREET | 3086550021 | 11234.0 |
View most recent service requests#
street_flooding_pluto_gdf[preview_columns].tail(5)
geometry_multipolygon_pluto_x | created_date | borough | Borough | street_name | bbl | ZipCode | |
---|---|---|---|---|---|---|---|
35151 | NaN | 2023-03-14 14:53:00 | QUEENS | QN | DECOSTA AVENUE | 4160250028 | 11692.0 |
35152 | NaN | 2023-03-14 17:41:00 | QUEENS | QN | 26 AVENUE | 4049150005 | 11358.0 |
35153 | NaN | 2023-03-14 13:47:00 | BROOKLYN | BK | HIMROD STREET | 3032680001 | 11237.0 |
35154 | NaN | 2023-03-14 12:47:00 | QUEENS | NaN | NaN | NaN | NaN |
35155 | NaN | 2023-03-14 13:35:00 | BROOKLYN | BK | BRIGHTON 8 COURT | 3072640043 | 11235.0 |
"""street_flooding_pluto_gdf.to_file(
merge_flood_mappluto_output_ofgdb,
driver = 'OpenFileGDB',
# engine = 'fiona',
# index = True,
# crs = 'EPSG:4326'
# crs = 'EPSG:2263'
)
"""
"street_flooding_pluto_gdf.to_file(\n merge_flood_mappluto_output_ofgdb, \n driver = 'OpenFileGDB', \n # engine = 'fiona',\n # index = True,\n # crs = 'EPSG:4326'\n # crs = 'EPSG:2263'\n)\n\n"
Check % Join by BBL#
def pct_join(gdf: gpd.GeoDataFrame, column_check: str) -> dict:
gdf_join_info = dict()
df_size = len(gdf)
# None does not return the NaN count
# missing_column_check_count = len(gdf[gdf[column_check] == None])
missing_column_check_count = gdf[column_check].isna().sum()
gdf_join_info['df_size'] = df_size
gdf_join_info['match_count'] = df_size - missing_column_check_count
gdf_join_info[f'missing_{(column_check)}_count'] = missing_column_check_count
gdf_join_info['pct_join'] = round((df_size - missing_column_check_count) / df_size * 100, 2)
gdf_join_info['pct_missing'] = round(missing_column_check_count / df_size * 100, 2)
return gdf_join_info
bbl_join_stats_dict = pct_join(street_flooding_pluto_gdf, 'Borough')
bbl_join_stats_dict
{'df_size': 35156,
'match_count': 21612,
'missing_Borough_count': 13544,
'pct_join': 61.47,
'pct_missing': 38.53}
Join #2: Spatial Join#
Split DataFrame Into BBL Matches and Non-matches#
BBL matches#
street_flooding_pluto_bbl_match_gdf = \
street_flooding_pluto_gdf[street_flooding_pluto_gdf['Borough'] != None]
BBL match GeoDataFrame info
street_flooding_pluto_bbl_match_gdf.info()
<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 35156 entries, 0 to 35155
Columns: 141 entries, location_state to geometry_multipolygon_pluto_x
dtypes: datetime64[ns](4), float64(58), geometry(1), object(78)
memory usage: 38.1+ MB
Verify match count
bbl_join_stats_dict
{'df_size': 35156,
'match_count': 21612,
'missing_Borough_count': 13544,
'pct_join': 61.47,
'pct_missing': 38.53}
Note: Revisit calculation for match
match_count = bbl_join_stats_dict['df_size'] - \
bbl_join_stats_dict['missing_Borough_count']
match_count == \
len(street_flooding_pluto_bbl_match_gdf)
False
Add BBL match count to JSON file#
with open(data_stats_json_input, 'r') as read_json:
data_stats = json.load(read_json)
print(data_stats)
{'street_flood_orig': 35156, 'street_flood_clean': 34147, 'street_flood_mappluto_bbl': 35156, 'street_flood_mappluto_sjoin_nearest': 34206}
data_stats['street_flood_mappluto_bbl'] = len(street_flooding_pluto_bbl_match_gdf)
print(data_stats)
{'street_flood_orig': 35156, 'street_flood_clean': 34147, 'street_flood_mappluto_bbl': 35156, 'street_flood_mappluto_sjoin_nearest': 34206}
BBL mismatches#
Before rejoining with sjoin, extract the street flooding rows that did not match by BBL.
street_flooding_pluto_bbl_mismatch_gdf = \
street_flooding_pluto_gdf[street_flooding_pluto_gdf['Borough'].isna()]
street_flooding_pluto_bbl_mismatch_gdf.head()
location_state | facility_type | intersection_street_2 | city | location_zip | park_borough | latitude | road_ramp | created_date | agency | ... | FIRM07_FLAG | PFIRM15_FLAG | Version | DCPEdited | Latitude | Longitude | Notes | Shape_Leng | Shape_Area | geometry_multipolygon_pluto_x | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | N/A | RICHMOND TERRACE | STATEN ISLAND | STATEN ISLAND | 40.63865739388004 | NaN | 2010-01-02 12:00:00 | DEP | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ||
11 | N/A | WEST 20 STREET | NEW YORK | MANHATTAN | 40.74222718212239 | NaN | 2010-01-07 17:56:00 | DEP | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ||
13 | N/A | ADELPHI AVENUE | STATEN ISLAND | STATEN ISLAND | 40.516255348580096 | NaN | 2010-01-09 19:48:00 | DEP | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ||
14 | N/A | 72 AVENUE | Fresh Meadows | QUEENS | 40.72826788677406 | NaN | 2010-01-13 13:30:00 | DEP | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ||
15 | N/A | AVENUE U | BROOKLYN | BROOKLYN | 40.5971705119265 | NaN | 2010-01-13 16:57:00 | DEP | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 141 columns
BBL mismatch GeoDataFrame info
street_flooding_pluto_bbl_mismatch_gdf.info()
<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 13544 entries, 1 to 35154
Columns: 141 entries, location_state to geometry_multipolygon_pluto_x
dtypes: datetime64[ns](4), float64(58), geometry(1), object(78)
memory usage: 14.7+ MB
Verify mismatch count
bbl_join_stats_dict['missing_Borough_count'] == \
len(street_flooding_pluto_bbl_mismatch_gdf)
True
Copy only the street flooding columns
street_flooding_bbl_no_match_gdf = \
street_flooding_pluto_bbl_mismatch_gdf[street_flooding_columns]
Perform sjoin
#
Note: sjoin
is skipped because the remaining street flooding complaint locations did no match the MapPluto dataset. sjoin_nearest
will be used instead. [consider revisiting]
Set active geometry
%%script echo "[skip] Reason: using `sjoin_nearest`"
street_flooding_bbl_no_match_gdf.set_geometry('geometry', inplace = True)
[skip] Reason: using `sjoin_nearest`
%%script echo "[skip] Reason: using `sjoin_nearest`"
street_flooding_map_pluto_sjoin_gdf = (
gpd.sjoin(
street_flooding_bbl_no_match_gdf,
pluto_4326_gdf,
# pluto_4326_gdf_copy_si_only,
how = 'left',
# predicate = 'intersects'
)
)
[skip] Reason: using `sjoin_nearest`
%%script echo "[skip] Reason: using `sjoin_nearest`"
preview_columns_sjoin = [
'borough',
'Borough',
'created_date',
'street_name',
'bbl',
'ZipCode',
'geometry',
'TaxMap'
]
[skip] Reason: using `sjoin_nearest`
%%script echo "[skip] Reason: using `sjoin_nearest`"
street_flooding_map_pluto_sjoin_gdf[preview_columns_sjoin].head(10)
[skip] Reason: using `sjoin_nearest`
%%script echo "[skip] Reason: using `sjoin_nearest`"
street_flooding_map_pluto_sjoin_gdf[preview_columns_sjoin].tail(10)
[skip] Reason: using `sjoin_nearest`
%%script echo "[skip] Reason: using `sjoin_nearest`"
nan_count = street_flooding_map_pluto_sjoin_gdf['Borough'].isna().sum()
nan_count
[skip] Reason: using `sjoin_nearest`
Perform sjoin_nearest
#
Use Projected CRS for gpd.sjoin_nearest
New York, Long Island (EPSG: 2263)
Mercator (EPSG: 3857)
Google Maps
Open Street Maps
Stamen Maps
Set active geometry
Implement geopandas.sjoin_nearest
street_flooding_bbl_no_match_gdf.crs
<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- undefined
Datum: World Geodetic System 1984
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich
pluto_4326_gdf.crs
<Derived Projected CRS: EPSG:2263>
Name: NAD83 / New York Long Island (ftUS)
Axis Info [cartesian]:
- [east]: Easting (US survey foot)
- [north]: Northing (US survey foot)
Area of Use:
- undefined
Coordinate Operation:
- name: unnamed
- method: Lambert Conic Conformal (2SP)
Datum: North American Datum 1983
- Ellipsoid: GRS 1980
- Prime Meridian: Greenwich
col_list = pluto_4326_gdf.columns.tolist()
for col in col_list:
if col[:8] == 'geometry':
print(col)
geometry
geometry_multipolygon_pluto_x
pluto_4326_gdf.geometry
0 MULTIPOLYGON (((979834.644 191977.863, 980575....
1 MULTIPOLYGON (((973839.736 193894.244, 972500....
2 MULTIPOLYGON (((979286.676 198786.617, 977468....
3 MULTIPOLYGON (((979340.404 199175.516, 977511....
4 MULTIPOLYGON (((981288.593 194289.382, 981164....
...
857209 MULTIPOLYGON (((914466.480 124580.058, 914512....
857210 MULTIPOLYGON (((914321.022 124460.350, 914256....
857211 MULTIPOLYGON (((914318.219 124604.403, 914338....
857212 MULTIPOLYGON (((914740.567 124877.857, 914639....
857213 MULTIPOLYGON (((914842.706 124746.005, 914775....
Name: geometry, Length: 857214, dtype: geometry
pluto_4326_sjoin_nearest_gdf = pluto_4326_gdf.drop(columns = 'geometry')
pluto_4326_sjoin_nearest_gdf = pluto_4326_gdf.copy()
street_flooding_map_pluto_sjoin_nearest_gdf = (
gpd.sjoin_nearest(
# Convert CRS to EPSG: 2263
street_flooding_bbl_no_match_gdf.to_crs('2263'),
pluto_4326_sjoin_nearest_gdf, # .to_crs('2263'),
how = 'left',
# max_distance = 100,
distance_col = 'distance'
)
)
Preview sjoin_nearest
results#
Check index
Select columns to preview
preview_columns_sjoin_nearest_list = [
'borough',
'Borough',
'created_date',
'street_name',
'bbl',
'ZipCode',
'geometry',
'TaxMap'
]
street_flooding_map_pluto_sjoin_nearest_gdf[preview_columns_sjoin_nearest_list].head()
borough | Borough | created_date | street_name | bbl | ZipCode | geometry | TaxMap | |
---|---|---|---|---|---|---|---|---|
1 | STATEN ISLAND | SI | 2010-01-02 12:00:00 | NaN | NaN | 10303.0 | POINT (944481.011 171989.064) | 50503 |
11 | MANHATTAN | MN | 2010-01-07 17:56:00 | NaN | NaN | 10011.0 | POINT (985078.930 209690.212) | 10305 |
13 | STATEN ISLAND | SI | 2010-01-09 19:48:00 | NaN | NaN | 10309.0 | POINT (920039.197 127447.098) | 53101 |
14 | QUEENS | QN | 2010-01-13 13:30:00 | NaN | NaN | 11365.0 | POINT (1037695.047 204663.289) | 43103 |
15 | BROOKLYN | BK | 2010-01-13 16:57:00 | NaN | NaN | 11223.0 | POINT (992169.033 156843.127) | 32105 |
street_flooding_map_pluto_sjoin_nearest_gdf[preview_columns_sjoin_nearest_list].tail()
borough | Borough | created_date | street_name | bbl | ZipCode | geometry | TaxMap | |
---|---|---|---|---|---|---|---|---|
35142 | STATEN ISLAND | SI | 2023-03-13 18:11:00 | ARTHUR KILL | NaN | NaN | POINT (919194.141 141918.073) | 52901 |
35143 | BROOKLYN | BK | 2023-03-13 22:35:00 | NaN | NaN | 11230.0 | POINT (996137.026 164552.152) | 32007 |
35144 | BROOKLYN | BK | 2023-03-13 18:29:00 | NaN | NaN | 11249.0 | POINT (994191.964 200891.211) | 30804 |
35147 | QUEENS | QN | 2023-03-14 01:08:00 | BEACH 7 STREET | NaN | 11691.0 | POINT (1055849.100 157337.302) | 45903 |
35154 | QUEENS | QN | 2023-03-14 12:47:00 | NaN | NaN | 11368.0 | POINT (1021682.011 207776.266) | 41102 |
Check number of street flooding complaint locations that did not match after
the sjoin_nearest
implementation
nan_count = street_flooding_map_pluto_sjoin_nearest_gdf['Borough'].isna().sum()
nan_count
1009
sjoin_nearest_stats_dict = \
pct_join(street_flooding_map_pluto_sjoin_nearest_gdf, 'Borough')
sjoin_nearest_stats_dict
{'df_size': 13555,
'match_count': 12546,
'missing_Borough_count': 1009,
'pct_join': 92.56,
'pct_missing': 7.44}
Merge BBL
and sjoin_nearest
GeoPandas DataFrames#
Confirm Both DataFrame Columns Match#
Street Flooding MapPluto BBL
column count#
street_flooding_pluto_bbl_columns_list = street_flooding_pluto_bbl_match_gdf.columns.tolist()
street_flooding_pluto_bbl_columns_count = len(street_flooding_pluto_bbl_columns_list)
print(f'Street Flooding MapPluto BBL column count: {street_flooding_pluto_bbl_columns_count}')
Street Flooding MapPluto BBL column count: 141
Street Flooding MapPluto sjoin_nearest
column count#
street_flooding_map_pluto_sjoin_nearest_columns_list = street_flooding_map_pluto_sjoin_nearest_gdf.columns.tolist()
street_flooding_map_pluto_sjoin_nearest_columns_count = len(street_flooding_map_pluto_sjoin_nearest_columns_list)
print(f'Street Flooding MapPluto sjoin_nearest column count: {street_flooding_map_pluto_sjoin_nearest_columns_count}')
Street Flooding MapPluto sjoin_nearest column count: 142
Compare columns#
bbl_only = set(street_flooding_pluto_bbl_columns_list) - set(street_flooding_map_pluto_sjoin_nearest_columns_list)
bbl_only
{'geometry_point_street_x'}
sjoin_nearest_only = set(street_flooding_map_pluto_sjoin_nearest_columns_list) - set(street_flooding_pluto_bbl_columns_list)
sjoin_nearest_only
{'distance', 'index_right'}
Verify index is the unique_key#
street_flooding_pluto_bbl_match_gdf.index
Int64Index([ 0, 1, 2, 3, 4, 5, 6, 7, 8,
9,
...
35146, 35147, 35148, 35149, 35150, 35151, 35152, 35153, 35154,
35155],
dtype='int64', length=35156)
street_flooding_map_pluto_sjoin_nearest_gdf.index
Int64Index([ 1, 11, 13, 14, 15, 17, 18, 20, 24,
30,
...
35131, 35134, 35136, 35137, 35141, 35142, 35143, 35144, 35147,
35154],
dtype='int64', length=13555)
DataFrame column transform#
BBL match DataFrame
Column align
street_flooding_pluto_bbl_match_col_align_gdf = \
street_flooding_pluto_bbl_match_gdf.drop(columns = ['geometry_point_street_x'], axis = 1)
street_flooding_map_pluto_sjoin_nearest_col_align_gdf = \
street_flooding_map_pluto_sjoin_nearest_gdf.drop(columns = ['distance', 'index_right'], axis = 1)
Re-compare columns#
street_flooding_pluto_bbl_match_col_align_columns_list = street_flooding_pluto_bbl_match_col_align_gdf.columns.tolist()
street_flooding_map_pluto_sjoin_nearest_col_align_columns_list = street_flooding_map_pluto_sjoin_nearest_col_align_gdf.columns.tolist()
BBL
match only
bbl_recheck_only = set(street_flooding_pluto_bbl_match_col_align_columns_list) - set(street_flooding_map_pluto_sjoin_nearest_col_align_columns_list)
bbl_recheck_only
set()
sjoin_nearest
match only
sjoin_nearest_only = set(street_flooding_map_pluto_sjoin_nearest_col_align_columns_list) - set(street_flooding_pluto_bbl_match_col_align_columns_list)
sjoin_nearest_only
set()
for col in street_flooding_pluto_bbl_match_col_align_columns_list:
if col[:8] == 'geometry':
print(col)
geometry
geometry_multipolygon_pluto_x
for col in street_flooding_map_pluto_sjoin_nearest_col_align_columns_list:
if col[:8] == 'geometry':
print(col)
geometry
geometry_multipolygon_pluto_x
Implement Merge#
Set active geometry#
street_flooding_pluto_bbl_match_col_align_gdf.crs
<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- undefined
Datum: World Geodetic System 1984
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich
street_flooding_map_pluto_sjoin_nearest_col_align_gdf.crs
<Derived Projected CRS: EPSG:2263>
Name: NAD83 / New York Long Island (ftUS)
Axis Info [cartesian]:
- X[east]: Easting (US survey foot)
- Y[north]: Northing (US survey foot)
Area of Use:
- name: United States (USA) - New York - counties of Bronx; Kings; Nassau; New York; Queens; Richmond; Suffolk.
- bounds: (-74.26, 40.47, -71.8, 41.3)
Coordinate Operation:
- name: SPCS83 New York Long Island zone (US Survey feet)
- method: Lambert Conic Conformal (2SP)
Datum: North American Datum 1983
- Ellipsoid: GRS 1980
- Prime Meridian: Greenwich
street_flooding_mappluto_merge_gdf = \
pd.concat( \
[
street_flooding_pluto_bbl_match_col_align_gdf.to_crs('2263'),
street_flooding_map_pluto_sjoin_nearest_col_align_gdf # .to_crs('2263')
]
)
Preview Results#
street_flooding_mappluto_merge_gdf.info()
<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 48711 entries, 0 to 35154
Columns: 140 entries, location_state to geometry_multipolygon_pluto_x
dtypes: datetime64[ns](4), float64(58), geometry(1), object(77)
memory usage: 52.4+ MB
preview_columns_merge_list = [
# 'geometry_point_street',
'geometry_multipolygon_pluto_x',
'borough',
'Borough',
'created_date',
'street_name',
'bbl',
'ZipCode',
'geometry',
'Tract2010',
'TaxMap'
]
street_flooding_mappluto_merge_gdf[preview_columns_merge_list].head()
geometry_multipolygon_pluto_x | borough | Borough | created_date | street_name | bbl | ZipCode | geometry | Tract2010 | TaxMap | |
---|---|---|---|---|---|---|---|---|---|---|
0 | NaN | BROOKLYN | BK | 2010-01-02 08:26:00 | DARE COURT | 3089000064 | 11229.0 | POINT (1005975.054 153432.161) | 0628 | 32605 |
1 | NaN | STATEN ISLAND | NaN | 2010-01-02 12:00:00 | NaN | NaN | NaN | POINT (944481.011 171989.064) | NaN | NaN |
2 | NaN | QUEENS | QN | 2010-01-02 17:45:00 | 146 STREET | 4120050012 | 11436.0 | POINT (1041026.066 187595.284) | 0186 | 45202 |
3 | NaN | QUEENS | QN | 2010-01-04 16:47:00 | 218 STREET | 4106210008 | 11428.0 | POINT (1056756.075 201721.315) | 0566 | 44701 |
4 | NaN | BROOKLYN | BK | 2010-01-05 10:37:00 | EAST 63 STREET | 3086550021 | 11234.0 | POINT (1009324.045 162441.183) | 070201 | 32505 |
street_flooding_mappluto_merge_gdf[preview_columns_merge_list].tail()
geometry_multipolygon_pluto_x | borough | Borough | created_date | street_name | bbl | ZipCode | geometry | Tract2010 | TaxMap | |
---|---|---|---|---|---|---|---|---|---|---|
35142 | NaN | STATEN ISLAND | SI | 2023-03-13 18:11:00 | ARTHUR KILL | NaN | NaN | POINT (919194.141 141918.073) | 0226 | 52901 |
35143 | NaN | BROOKLYN | BK | 2023-03-13 22:35:00 | NaN | NaN | 11230.0 | POINT (996137.026 164552.152) | 0538 | 32007 |
35144 | NaN | BROOKLYN | BK | 2023-03-13 18:29:00 | NaN | NaN | 11249.0 | POINT (994191.964 200891.211) | 0555 | 30804 |
35147 | NaN | QUEENS | QN | 2023-03-14 01:08:00 | BEACH 7 STREET | NaN | 11691.0 | POINT (1055849.100 157337.302) | 101002 | 45903 |
35154 | NaN | QUEENS | QN | 2023-03-14 12:47:00 | NaN | NaN | 11368.0 | POINT (1021682.011 207776.266) | 0455 | 41102 |
Check total missing joins#
nan_count = street_flooding_mappluto_merge_gdf['Borough'].isna().sum()
nan_count
14553
street_flooding_mappluto_gdf_merge_stats_dict = \
pct_join(street_flooding_mappluto_merge_gdf, 'Borough')
street_flooding_mappluto_gdf_merge_stats_dict
{'df_size': 48711,
'match_count': 34158,
'missing_Borough_count': 14553,
'pct_join': 70.12,
'pct_missing': 29.88}
Note Revisit None vs NaN count. May needd to sum nan separately.
Reference:
pandas GroupBy columns with NaN (missing) values
# len(street_flooding_mappluto_merge_gdf)
# street_flooding_mappluto_merge_gdf.groupby(['Borough'], dropna = False)['Borough'].count()
Save Final Merge Match Count to JSON file#
data_stats['street_flood_mappluto_sjoin_nearest'] = \
street_flooding_mappluto_gdf_merge_stats_dict['match_count']
print(data_stats)
{'street_flood_orig': 35156, 'street_flood_clean': 34147, 'street_flood_mappluto_bbl': 35156, 'street_flood_mappluto_sjoin_nearest': 34158}
data_stats_json = {}
for key, value in data_stats.items():
data_stats_json[key] = int(value)
with open(data_stats_json_input, 'w') as write_json:
json.dump(data_stats_json, write_json, indent = 4)
Save Merged Dataset#
Reference:
geopandas.GeoDataFrame.to_file
Check supported driver options#
fiona.supported_drivers
{'DXF': 'rw',
'CSV': 'raw',
'OpenFileGDB': 'raw',
'ESRIJSON': 'r',
'ESRI Shapefile': 'raw',
'FlatGeobuf': 'raw',
'GeoJSON': 'raw',
'GeoJSONSeq': 'raw',
'GPKG': 'raw',
'GML': 'rw',
'OGR_GMT': 'rw',
'GPX': 'rw',
'Idrisi': 'r',
'MapInfo File': 'raw',
'DGN': 'raw',
'PCIDSK': 'raw',
'OGR_PDS': 'r',
'S57': 'r',
'SQLite': 'raw',
'TopoJSON': 'r'}
Confirm output folder exists#
if not os.path.exists('data/merge'):
print('Creating new folder for merged street flooding and MapPluto dataset: "data/merge"')
os.makedirs('data/merge')
else:
print('"data/merge" folder already exists')
"data/merge" folder already exists
Reindex Merged GeoPandas DataFrame#
Save as .gdb
using OpenFileGDB
driver#
street_flooding_mappluto_merge_gdf.to_file(
merge_flood_mappluto_output_ofgdb,
driver = 'OpenFileGDB',
# engine = 'fiona',
# crs = 'EPSG:4326'
)
Save as .geojson
using GeoJSON
driver#
street_flooding_mappluto_merge_gdf.to_file(
merge_flood_mappluto_output_geojson,
driver = 'GeoJSON',
# engine = 'fiona',
# crs = 'EPSG:4326'
)
References#
Troubleshooting#
geospatial libraries#
CRSError: Invalid projection: EPSG:4326 : #2562
GDAL ImportError: DLL load failed: The specified module could not be found
GeoPandas DataFrames and categorical data#
BUG: to_file cannot write dataframes containing categorical data #1883
Solution:
gdf[<new_column>] = gdf.geometry.astype(str)
.merge() keep returning pandas dataframe due to duplicate geometry columns #824