{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Clean Data\n",
"\n",
""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Import Libraries"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Standard Libraries\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import json"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"For more on working with `json` in Python, see {cite}`lofaro2018json`."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### External Libraries"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import geopandas as gpd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Define Variables"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"nyc_street_flooding_input = 'data/street-flooding/street-flood-complaints_rows-all.geojson'\n",
"nyc_street_flooding_output = 'data/street-flooding/clean_street-flood-complaints_rows-all.geojson'\n",
"data_stats_json_output = 'data/data-stats.json'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Get Original Data"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"street_flooding_gdf = gpd.read_file(nyc_street_flooding_input)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Before Count"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"There were 35,056 street flooding complaints from 2010 to the present.\n"
]
}
],
"source": [
"street_flooding_complaints_before_count = len(street_flooding_gdf)\n",
"print(f'There were {street_flooding_complaints_before_count:,} street flooding complaints from 2010 to the present.')"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Set `unique_key` as Index"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"street_flooding_gdf.set_index('unique_key', inplace=True)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Remove Rows With Missing `geometry`"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"street_flooding_gdf.dropna(subset = ['geometry'], inplace = True)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## After Count"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"There were 34,049 street flooding complaints after rows with missing geometry have been removed.\n"
]
}
],
"source": [
"street_flooding_complaints_after_count = len(street_flooding_gdf)\n",
"print(f'There were {street_flooding_complaints_after_count:,} street flooding complaints after rows with missing geometry have been removed.')"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Preview Street Flooding Data"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" created_date | \n",
" borough | \n",
" bbl | \n",
" geometry | \n",
"
\n",
" \n",
" unique_key | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 15639934 | \n",
" 2010-01-02 08:26:00 | \n",
" BROOKLYN | \n",
" 3089000064 | \n",
" POINT (-73.92178 40.58778) | \n",
"
\n",
" \n",
" 15640572 | \n",
" 2010-01-02 12:00:00 | \n",
" STATEN ISLAND | \n",
" NaN | \n",
" POINT (-74.14329 40.63866) | \n",
"
\n",
" \n",
" 15640664 | \n",
" 2010-01-02 17:45:00 | \n",
" QUEENS | \n",
" 4120050012 | \n",
" POINT (-73.79530 40.68140) | \n",
"
\n",
" \n",
" 15655327 | \n",
" 2010-01-04 16:47:00 | \n",
" QUEENS | \n",
" 4106210008 | \n",
" POINT (-73.73843 40.72006) | \n",
"
\n",
" \n",
" 15668560 | \n",
" 2010-01-05 10:37:00 | \n",
" BROOKLYN | \n",
" 3086550021 | \n",
" POINT (-73.90969 40.61250) | \n",
"
\n",
" \n",
" 15674300 | \n",
" 2010-01-06 19:26:00 | \n",
" BROOKLYN | \n",
" 3029270015 | \n",
" POINT (-73.93297 40.71584) | \n",
"
\n",
" \n",
" 15674896 | \n",
" 2010-01-06 08:24:00 | \n",
" QUEENS | \n",
" 4119960122 | \n",
" POINT (-73.80255 40.67925) | \n",
"
\n",
" \n",
" 15674924 | \n",
" 2010-01-06 09:17:00 | \n",
" STATEN ISLAND | \n",
" 5040740044 | \n",
" POINT (-74.10646 40.55866) | \n",
"
\n",
" \n",
" 15675505 | \n",
" 2010-01-06 06:00:00 | \n",
" QUEENS | \n",
" 4030030044 | \n",
" POINT (-73.87694 40.71804) | \n",
"
\n",
" \n",
" 15683503 | \n",
" 2010-01-07 10:16:00 | \n",
" STATEN ISLAND | \n",
" 5014850078 | \n",
" POINT (-74.14943 40.61979) | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" created_date borough bbl \\\n",
"unique_key \n",
"15639934 2010-01-02 08:26:00 BROOKLYN 3089000064 \n",
"15640572 2010-01-02 12:00:00 STATEN ISLAND NaN \n",
"15640664 2010-01-02 17:45:00 QUEENS 4120050012 \n",
"15655327 2010-01-04 16:47:00 QUEENS 4106210008 \n",
"15668560 2010-01-05 10:37:00 BROOKLYN 3086550021 \n",
"15674300 2010-01-06 19:26:00 BROOKLYN 3029270015 \n",
"15674896 2010-01-06 08:24:00 QUEENS 4119960122 \n",
"15674924 2010-01-06 09:17:00 STATEN ISLAND 5040740044 \n",
"15675505 2010-01-06 06:00:00 QUEENS 4030030044 \n",
"15683503 2010-01-07 10:16:00 STATEN ISLAND 5014850078 \n",
"\n",
" geometry \n",
"unique_key \n",
"15639934 POINT (-73.92178 40.58778) \n",
"15640572 POINT (-74.14329 40.63866) \n",
"15640664 POINT (-73.79530 40.68140) \n",
"15655327 POINT (-73.73843 40.72006) \n",
"15668560 POINT (-73.90969 40.61250) \n",
"15674300 POINT (-73.93297 40.71584) \n",
"15674896 POINT (-73.80255 40.67925) \n",
"15674924 POINT (-74.10646 40.55866) \n",
"15675505 POINT (-73.87694 40.71804) \n",
"15683503 POINT (-74.14943 40.61979) "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"street_flooding_gdf[['created_date', 'borough', 'bbl', 'geometry']].head(10)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" created_date | \n",
" borough | \n",
" bbl | \n",
" geometry | \n",
"
\n",
" \n",
" unique_key | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 56894127 | \n",
" 2023-02-25 21:17:00 | \n",
" QUEENS | \n",
" 4066360043 | \n",
" POINT (-73.82293 40.71523) | \n",
"
\n",
" \n",
" 56895026 | \n",
" 2023-02-25 12:47:00 | \n",
" QUEENS | \n",
" 4067470075 | \n",
" POINT (-73.81219 40.73705) | \n",
"
\n",
" \n",
" 56899909 | \n",
" 2023-02-25 20:08:00 | \n",
" BROOKLYN | \n",
" 3056230001 | \n",
" POINT (-73.99062 40.63595) | \n",
"
\n",
" \n",
" 56900879 | \n",
" 2023-02-26 09:08:00 | \n",
" QUEENS | \n",
" 4015360120 | \n",
" POINT (-73.88446 40.73925) | \n",
"
\n",
" \n",
" 56904542 | \n",
" 2023-02-26 18:05:00 | \n",
" STATEN ISLAND | \n",
" 5061080026 | \n",
" POINT (-74.20391 40.54321) | \n",
"
\n",
" \n",
" 56909777 | \n",
" 2023-02-27 12:26:00 | \n",
" QUEENS | \n",
" 4046820038 | \n",
" POINT (-73.81259 40.78476) | \n",
"
\n",
" \n",
" 56911030 | \n",
" 2023-02-27 15:38:00 | \n",
" BROOKLYN | \n",
" 3078620043 | \n",
" POINT (-73.93696 40.61734) | \n",
"
\n",
" \n",
" 56913386 | \n",
" 2023-02-27 17:47:00 | \n",
" BROOKLYN | \n",
" 3056220059 | \n",
" POINT (-73.99228 40.63696) | \n",
"
\n",
" \n",
" 56914818 | \n",
" 2023-02-27 08:17:00 | \n",
" QUEENS | \n",
" 4051937501 | \n",
" POINT (-73.82129 40.75430) | \n",
"
\n",
" \n",
" 56915899 | \n",
" 2023-02-27 10:23:00 | \n",
" QUEENS | \n",
" 4137350027 | \n",
" POINT (-73.74750 40.65427) | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" created_date borough bbl \\\n",
"unique_key \n",
"56894127 2023-02-25 21:17:00 QUEENS 4066360043 \n",
"56895026 2023-02-25 12:47:00 QUEENS 4067470075 \n",
"56899909 2023-02-25 20:08:00 BROOKLYN 3056230001 \n",
"56900879 2023-02-26 09:08:00 QUEENS 4015360120 \n",
"56904542 2023-02-26 18:05:00 STATEN ISLAND 5061080026 \n",
"56909777 2023-02-27 12:26:00 QUEENS 4046820038 \n",
"56911030 2023-02-27 15:38:00 BROOKLYN 3078620043 \n",
"56913386 2023-02-27 17:47:00 BROOKLYN 3056220059 \n",
"56914818 2023-02-27 08:17:00 QUEENS 4051937501 \n",
"56915899 2023-02-27 10:23:00 QUEENS 4137350027 \n",
"\n",
" geometry \n",
"unique_key \n",
"56894127 POINT (-73.82293 40.71523) \n",
"56895026 POINT (-73.81219 40.73705) \n",
"56899909 POINT (-73.99062 40.63595) \n",
"56900879 POINT (-73.88446 40.73925) \n",
"56904542 POINT (-74.20391 40.54321) \n",
"56909777 POINT (-73.81259 40.78476) \n",
"56911030 POINT (-73.93696 40.61734) \n",
"56913386 POINT (-73.99228 40.63696) \n",
"56914818 POINT (-73.82129 40.75430) \n",
"56915899 POINT (-73.74750 40.65427) "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"street_flooding_gdf[['created_date', 'borough', 'bbl', 'geometry']].tail(10)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Save Datasets"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Save Street Flooding GeoDataFrame"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"street_flooding_gdf.to_file(nyc_street_flooding_output, driver='GeoJSON')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Save Counts to JSON file"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"gdf_counts = {\n",
" \"street_flood_orig\": street_flooding_complaints_before_count,\n",
" \"street_flood_clean\": street_flooding_complaints_after_count\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"with open(data_stats_json_output, 'w') as write_json:\n",
" json.dump(gdf_counts, write_json, indent = 4)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## References\n",
"\n",
"### JSON\n",
"\n",
"[Working With JSON Data in Python| Real Python](https://realpython.com/python-json/)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "nyc-street-flooding-analysis",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.0"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "8faf5fcf33adbeb030828c995532f49ce6aeec881dbe0c2746b63b3efb333c96"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}