{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "**This notebook is an exercise in the [Data Cleaning](https://www.kaggle.com/learn/data-cleaning) course. You can reference the tutorial at [this link](https://www.kaggle.com/alexisbcook/handling-missing-values).**\n", "\n", "---\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In this exercise, you'll apply what you learned in the **Handling missing values** tutorial.\n", "\n", "# Setup\n", "\n", "The questions below will give you feedback on your work. Run the following cell to set up the feedback system." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "execution": { "iopub.execute_input": "2021-07-01T17:24:04.967259Z", "iopub.status.busy": "2021-07-01T17:24:04.966426Z", "iopub.status.idle": "2021-07-01T17:24:09.349378Z", "shell.execute_reply": "2021-07-01T17:24:09.347725Z", "shell.execute_reply.started": "2021-07-01T17:24:04.967124Z" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3347: DtypeWarning: Columns (22,32) have mixed types.Specify dtype option on import or set low_memory=False.\n", " if (await self.run_code(code, result, async_=asy)):\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Setup Complete\n" ] } ], "source": [ "from learntools.core import binder\n", "binder.bind(globals())\n", "from learntools.data_cleaning.ex1 import *\n", "print(\"Setup Complete\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 1) Take a first look at the data\n", "\n", "Run the next code cell to load in the libraries and dataset you'll use to complete the exercise." ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "execution": { "iopub.execute_input": "2021-07-01T17:24:09.352245Z", "iopub.status.busy": "2021-07-01T17:24:09.351782Z", "iopub.status.idle": "2021-07-01T17:24:11.340420Z", "shell.execute_reply": "2021-07-01T17:24:11.339137Z", "shell.execute_reply.started": "2021-07-01T17:24:09.352199Z" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3156: DtypeWarning: Columns (22,32) have mixed types.Specify dtype option on import or set low_memory=False.\n", " interactivity=interactivity, compiler=compiler, result=result)\n" ] } ], "source": [ "# modules we'll use\n", "import pandas as pd\n", "import numpy as np\n", "\n", "# read in all our data\n", "sf_permits = pd.read_csv(\"../input/building-permit-applications-data/Building_Permits.csv\")\n", "\n", "# set seed for reproducibility\n", "np.random.seed(0) " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Use the code cell below to print the first five rows of the `sf_permits` DataFrame." ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "execution": { "iopub.execute_input": "2021-07-01T17:24:11.342107Z", "iopub.status.busy": "2021-07-01T17:24:11.341802Z", "iopub.status.idle": "2021-07-01T17:24:11.384350Z", "shell.execute_reply": "2021-07-01T17:24:11.382757Z", "shell.execute_reply.started": "2021-07-01T17:24:11.342078Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " | Permit Number | \n", "Permit Type | \n", "Permit Type Definition | \n", "Permit Creation Date | \n", "Block | \n", "Lot | \n", "Street Number | \n", "Street Number Suffix | \n", "Street Name | \n", "Street Suffix | \n", "... | \n", "Existing Construction Type | \n", "Existing Construction Type Description | \n", "Proposed Construction Type | \n", "Proposed Construction Type Description | \n", "Site Permit | \n", "Supervisor District | \n", "Neighborhoods - Analysis Boundaries | \n", "Zipcode | \n", "Location | \n", "Record ID | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "201505065519 | \n", "4 | \n", "sign - erect | \n", "05/06/2015 | \n", "0326 | \n", "023 | \n", "140 | \n", "NaN | \n", "Ellis | \n", "St | \n", "... | \n", "3.0 | \n", "constr type 3 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "3.0 | \n", "Tenderloin | \n", "94102.0 | \n", "(37.785719256680785, -122.40852313194863) | \n", "1380611233945 | \n", "
1 | \n", "201604195146 | \n", "4 | \n", "sign - erect | \n", "04/19/2016 | \n", "0306 | \n", "007 | \n", "440 | \n", "NaN | \n", "Geary | \n", "St | \n", "... | \n", "3.0 | \n", "constr type 3 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "3.0 | \n", "Tenderloin | \n", "94102.0 | \n", "(37.78733980600732, -122.41063199757738) | \n", "1420164406718 | \n", "
2 | \n", "201605278609 | \n", "3 | \n", "additions alterations or repairs | \n", "05/27/2016 | \n", "0595 | \n", "203 | \n", "1647 | \n", "NaN | \n", "Pacific | \n", "Av | \n", "... | \n", "1.0 | \n", "constr type 1 | \n", "1.0 | \n", "constr type 1 | \n", "NaN | \n", "3.0 | \n", "Russian Hill | \n", "94109.0 | \n", "(37.7946573324287, -122.42232562979227) | \n", "1424856504716 | \n", "
3 | \n", "201611072166 | \n", "8 | \n", "otc alterations permit | \n", "11/07/2016 | \n", "0156 | \n", "011 | \n", "1230 | \n", "NaN | \n", "Pacific | \n", "Av | \n", "... | \n", "5.0 | \n", "wood frame (5) | \n", "5.0 | \n", "wood frame (5) | \n", "NaN | \n", "3.0 | \n", "Nob Hill | \n", "94109.0 | \n", "(37.79595867909168, -122.41557405519474) | \n", "1443574295566 | \n", "
4 | \n", "201611283529 | \n", "6 | \n", "demolitions | \n", "11/28/2016 | \n", "0342 | \n", "001 | \n", "950 | \n", "NaN | \n", "Market | \n", "St | \n", "... | \n", "3.0 | \n", "constr type 3 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "6.0 | \n", "Tenderloin | \n", "94102.0 | \n", "(37.78315261897309, -122.40950883997789) | \n", "144548169992 | \n", "
5 rows × 43 columns
\n", "\n", " | Permit Number | \n", "Permit Type | \n", "Permit Type Definition | \n", "Permit Creation Date | \n", "Block | \n", "Lot | \n", "Street Number | \n", "Street Number Suffix | \n", "Street Name | \n", "Street Suffix | \n", "... | \n", "Existing Construction Type | \n", "Existing Construction Type Description | \n", "Proposed Construction Type | \n", "Proposed Construction Type Description | \n", "Site Permit | \n", "Supervisor District | \n", "Neighborhoods - Analysis Boundaries | \n", "Zipcode | \n", "Location | \n", "Record ID | \n", "
---|
0 rows × 43 columns
\n", "