{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# Clean Data\n", "\n", "![clean-merge-data](img/scrub-process-diagram.png)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Import Libraries" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Standard Libraries\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import json" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "For more on working with `json` in Python, see {cite}`lofaro2018json`." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### External Libraries" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import geopandas as gpd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Define Variables" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "nyc_street_flooding_input = 'data/street-flooding/street-flood-complaints_rows-all.geojson'\n", "nyc_street_flooding_output = 'data/street-flooding/clean_street-flood-complaints_rows-all.geojson'\n", "data_stats_json_output = 'data/data-stats.json'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Get Original Data" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "street_flooding_gdf = gpd.read_file(nyc_street_flooding_input)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Before Count" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "There were 35,056 street flooding complaints from 2010 to the present.\n" ] } ], "source": [ "street_flooding_complaints_before_count = len(street_flooding_gdf)\n", "print(f'There were {street_flooding_complaints_before_count:,} street flooding complaints from 2010 to the present.')" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Set `unique_key` as Index" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "street_flooding_gdf.set_index('unique_key', inplace=True)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Remove Rows With Missing `geometry`" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "street_flooding_gdf.dropna(subset = ['geometry'], inplace = True)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## After Count" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "There were 34,049 street flooding complaints after rows with missing geometry have been removed.\n" ] } ], "source": [ "street_flooding_complaints_after_count = len(street_flooding_gdf)\n", "print(f'There were {street_flooding_complaints_after_count:,} street flooding complaints after rows with missing geometry have been removed.')" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Preview Street Flooding Data" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
created_dateboroughbblgeometry
unique_key
156399342010-01-02 08:26:00BROOKLYN3089000064POINT (-73.92178 40.58778)
156405722010-01-02 12:00:00STATEN ISLANDNaNPOINT (-74.14329 40.63866)
156406642010-01-02 17:45:00QUEENS4120050012POINT (-73.79530 40.68140)
156553272010-01-04 16:47:00QUEENS4106210008POINT (-73.73843 40.72006)
156685602010-01-05 10:37:00BROOKLYN3086550021POINT (-73.90969 40.61250)
156743002010-01-06 19:26:00BROOKLYN3029270015POINT (-73.93297 40.71584)
156748962010-01-06 08:24:00QUEENS4119960122POINT (-73.80255 40.67925)
156749242010-01-06 09:17:00STATEN ISLAND5040740044POINT (-74.10646 40.55866)
156755052010-01-06 06:00:00QUEENS4030030044POINT (-73.87694 40.71804)
156835032010-01-07 10:16:00STATEN ISLAND5014850078POINT (-74.14943 40.61979)
\n", "
" ], "text/plain": [ " created_date borough bbl \\\n", "unique_key \n", "15639934 2010-01-02 08:26:00 BROOKLYN 3089000064 \n", "15640572 2010-01-02 12:00:00 STATEN ISLAND NaN \n", "15640664 2010-01-02 17:45:00 QUEENS 4120050012 \n", "15655327 2010-01-04 16:47:00 QUEENS 4106210008 \n", "15668560 2010-01-05 10:37:00 BROOKLYN 3086550021 \n", "15674300 2010-01-06 19:26:00 BROOKLYN 3029270015 \n", "15674896 2010-01-06 08:24:00 QUEENS 4119960122 \n", "15674924 2010-01-06 09:17:00 STATEN ISLAND 5040740044 \n", "15675505 2010-01-06 06:00:00 QUEENS 4030030044 \n", "15683503 2010-01-07 10:16:00 STATEN ISLAND 5014850078 \n", "\n", " geometry \n", "unique_key \n", "15639934 POINT (-73.92178 40.58778) \n", "15640572 POINT (-74.14329 40.63866) \n", "15640664 POINT (-73.79530 40.68140) \n", "15655327 POINT (-73.73843 40.72006) \n", "15668560 POINT (-73.90969 40.61250) \n", "15674300 POINT (-73.93297 40.71584) \n", "15674896 POINT (-73.80255 40.67925) \n", "15674924 POINT (-74.10646 40.55866) \n", "15675505 POINT (-73.87694 40.71804) \n", "15683503 POINT (-74.14943 40.61979) " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "street_flooding_gdf[['created_date', 'borough', 'bbl', 'geometry']].head(10)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
created_dateboroughbblgeometry
unique_key
568941272023-02-25 21:17:00QUEENS4066360043POINT (-73.82293 40.71523)
568950262023-02-25 12:47:00QUEENS4067470075POINT (-73.81219 40.73705)
568999092023-02-25 20:08:00BROOKLYN3056230001POINT (-73.99062 40.63595)
569008792023-02-26 09:08:00QUEENS4015360120POINT (-73.88446 40.73925)
569045422023-02-26 18:05:00STATEN ISLAND5061080026POINT (-74.20391 40.54321)
569097772023-02-27 12:26:00QUEENS4046820038POINT (-73.81259 40.78476)
569110302023-02-27 15:38:00BROOKLYN3078620043POINT (-73.93696 40.61734)
569133862023-02-27 17:47:00BROOKLYN3056220059POINT (-73.99228 40.63696)
569148182023-02-27 08:17:00QUEENS4051937501POINT (-73.82129 40.75430)
569158992023-02-27 10:23:00QUEENS4137350027POINT (-73.74750 40.65427)
\n", "
" ], "text/plain": [ " created_date borough bbl \\\n", "unique_key \n", "56894127 2023-02-25 21:17:00 QUEENS 4066360043 \n", "56895026 2023-02-25 12:47:00 QUEENS 4067470075 \n", "56899909 2023-02-25 20:08:00 BROOKLYN 3056230001 \n", "56900879 2023-02-26 09:08:00 QUEENS 4015360120 \n", "56904542 2023-02-26 18:05:00 STATEN ISLAND 5061080026 \n", "56909777 2023-02-27 12:26:00 QUEENS 4046820038 \n", "56911030 2023-02-27 15:38:00 BROOKLYN 3078620043 \n", "56913386 2023-02-27 17:47:00 BROOKLYN 3056220059 \n", "56914818 2023-02-27 08:17:00 QUEENS 4051937501 \n", "56915899 2023-02-27 10:23:00 QUEENS 4137350027 \n", "\n", " geometry \n", "unique_key \n", "56894127 POINT (-73.82293 40.71523) \n", "56895026 POINT (-73.81219 40.73705) \n", "56899909 POINT (-73.99062 40.63595) \n", "56900879 POINT (-73.88446 40.73925) \n", "56904542 POINT (-74.20391 40.54321) \n", "56909777 POINT (-73.81259 40.78476) \n", "56911030 POINT (-73.93696 40.61734) \n", "56913386 POINT (-73.99228 40.63696) \n", "56914818 POINT (-73.82129 40.75430) \n", "56915899 POINT (-73.74750 40.65427) " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "street_flooding_gdf[['created_date', 'borough', 'bbl', 'geometry']].tail(10)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Save Datasets" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Save Street Flooding GeoDataFrame" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "street_flooding_gdf.to_file(nyc_street_flooding_output, driver='GeoJSON')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Save Counts to JSON file" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "gdf_counts = {\n", " \"street_flood_orig\": street_flooding_complaints_before_count,\n", " \"street_flood_clean\": street_flooding_complaints_after_count\n", "}" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "with open(data_stats_json_output, 'w') as write_json:\n", " json.dump(gdf_counts, write_json, indent = 4)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## References\n", "\n", "### JSON\n", "\n", "[Working With JSON Data in Python| Real Python](https://realpython.com/python-json/)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [] } ], "metadata": { "kernelspec": { "display_name": "nyc-street-flooding-analysis", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.0" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "8faf5fcf33adbeb030828c995532f49ce6aeec881dbe0c2746b63b3efb333c96" } } }, "nbformat": 4, "nbformat_minor": 2 }