This notebook is an exercise in the Data Cleaning course. You can reference the tutorial at this link.
In this exercise, you'll apply what you learned in the Parsing dates tutorial.
The questions below will give you feedback on your work. Run the following cell to set up the feedback system.
from learntools.core import binder
binder.bind(globals())
from learntools.data_cleaning.ex3 import *
print("Setup Complete")
Setup Complete
The first thing we'll need to do is load in the libraries and dataset we'll be using. We'll be working with a dataset containing information on earthquakes that occurred between 1965 and 2016.
# modules we'll use
import pandas as pd
import numpy as np
import seaborn as sns
import datetime
# read in our data
earthquakes = pd.read_csv("../input/earthquake-database/database.csv")
# set seed for reproducibility
np.random.seed(0)
You'll be working with the "Date" column from the earthquakes dataframe. Investigate this column now: does it look like it contains dates? What is the dtype of the column?
# TODO: Your code here!
earthquakes['Date'].head()
0    01/02/1965
1    01/04/1965
2    01/05/1965
3    01/08/1965
4    01/09/1965
Name: Date, dtype: object
Once you have answered the question above, run the code cell below to get credit for your work.
# Check your answer (Run this code cell to receive credit!)
q1.check()
Correct:
The "Date" column in the earthquakes
DataFrame does have dates. The dtype is "object".
# Line below will give you a hint
#q1.hint()
Most of the entries in the "Date" column follow the same format: "month/day/four-digit year". However, the entry at index 3378 follows a completely different pattern. Run the code cell below to see this.
earthquakes[3378:3383]
| | Date | Time | Latitude | Longitude | Type | Depth | Depth Error | Depth Seismic Stations | Magnitude | Magnitude Type | ... | Magnitude Seismic Stations | Azimuthal Gap | Horizontal Distance | Horizontal Error | Root Mean Square | ID | Source | Location Source | Magnitude Source | Status |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3378 | 1975-02-23T02:58:41.000Z | 1975-02-23T02:58:41.000Z | 8.017 | 124.075 | Earthquake | 623.0 | NaN | NaN | 5.6 | MB | ... | NaN | NaN | NaN | NaN | NaN | USP0000A09 | US | US | US | Reviewed |
| 3379 | 02/23/1975 | 03:53:36 | -21.727 | -71.356 | Earthquake | 33.0 | NaN | NaN | 5.6 | MB | ... | NaN | NaN | NaN | NaN | NaN | USP0000A0A | US | US | US | Reviewed |
| 3380 | 02/23/1975 | 07:34:11 | -10.879 | 166.667 | Earthquake | 33.0 | NaN | NaN | 5.5 | MS | ... | NaN | NaN | NaN | NaN | NaN | USP0000A0C | US | US | US | Reviewed |
| 3381 | 02/25/1975 | 05:20:05 | -7.388 | 149.798 | Earthquake | 33.0 | NaN | NaN | 5.5 | MB | ... | NaN | NaN | NaN | NaN | NaN | USP0000A12 | US | US | US | Reviewed |
| 3382 | 02/26/1975 | 04:48:55 | 85.047 | 97.969 | Earthquake | 33.0 | NaN | NaN | 5.6 | MS | ... | NaN | NaN | NaN | NaN | NaN | USP0000A1H | US | US | US | Reviewed |
5 rows × 21 columns
This does appear to be an issue with data entry: ideally, all entries in the column have the same format. We can get an idea of how widespread this issue is by checking the length of each entry in the "Date" column.
date_lengths = earthquakes.Date.str.len()
date_lengths.value_counts()
10    23409
24        3
Name: Date, dtype: int64
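The `str.len()` trick generalizes to any column where a single expected format implies a fixed width. A minimal sketch on toy data (not the earthquake file):

```python
import pandas as pd

# toy column: two well-formed dates and one ISO-style outlier
dates = pd.Series(["01/02/1965", "01/04/1965", "1975-02-23T02:58:41.000Z"])

lengths = dates.str.len()
# well-formed mm/dd/yyyy entries are exactly 10 characters long
malformed = dates[lengths != 10]
print(malformed.tolist())  # the entries that need special handling
```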
Looks like there are two more rows that have a date in a different format. Run the code cell below to obtain the indices corresponding to those rows and print the data.
indices = np.where(date_lengths == 24)[0]
print('Indices with corrupted data:', indices)
earthquakes.loc[indices]
Indices with corrupted data: [ 3378 7512 20650]
| | Date | Time | Latitude | Longitude | Type | Depth | Depth Error | Depth Seismic Stations | Magnitude | Magnitude Type | ... | Magnitude Seismic Stations | Azimuthal Gap | Horizontal Distance | Horizontal Error | Root Mean Square | ID | Source | Location Source | Magnitude Source | Status |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3378 | 1975-02-23T02:58:41.000Z | 1975-02-23T02:58:41.000Z | 8.017 | 124.075 | Earthquake | 623.0 | NaN | NaN | 5.6 | MB | ... | NaN | NaN | NaN | NaN | NaN | USP0000A09 | US | US | US | Reviewed |
| 7512 | 1985-04-28T02:53:41.530Z | 1985-04-28T02:53:41.530Z | -32.998 | -71.766 | Earthquake | 33.0 | NaN | NaN | 5.6 | MW | ... | NaN | NaN | NaN | NaN | 1.30 | USP0002E81 | US | US | HRV | Reviewed |
| 20650 | 2011-03-13T02:23:34.520Z | 2011-03-13T02:23:34.520Z | 36.344 | 142.344 | Earthquake | 10.1 | 13.9 | 289.0 | 5.8 | MWC | ... | NaN | 32.3 | NaN | NaN | 1.06 | USP000HWQP | US | US | GCMT | Reviewed |
3 rows × 21 columns
Given all of this information, it's your turn to create a new column "date_parsed" in the earthquakes dataset that has correctly parsed dates in it.
Note: When completing this problem, you are allowed to (but are not required to) amend the entries in the "Date" and "Time" columns. Do not remove any rows from the dataset.
# TODO: Your code here
earthquakes.loc[3378, "Date"] = "02/23/1975"
earthquakes.loc[7512, "Date"] = "04/28/1985"
earthquakes.loc[20650, "Date"] = "03/13/2011"
earthquakes['date_parsed'] = pd.to_datetime(earthquakes['Date'], format="%m/%d/%Y")
# Check your answer
q2.check()
Correct
# Lines below will give you a hint or solution code
q2.hint()
q2.solution()
Hint: Since there are only three rows with a fancy type, you might consider manually editing them. For instance, you can begin by setting earthquakes.loc[3378, "Date"] = "02/23/1975".
Solution:
earthquakes.loc[3378, "Date"] = "02/23/1975"
earthquakes.loc[7512, "Date"] = "04/28/1985"
earthquakes.loc[20650, "Date"] = "03/13/2011"
earthquakes['date_parsed'] = pd.to_datetime(earthquakes['Date'], format="%m/%d/%Y")
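Manual edits work here because only three rows are affected. When malformed rows are too numerous to fix by hand, a two-pass parse is one alternative (a sketch on toy data, not the solution the checker expects; errors="coerce" turns unparseable entries into NaT so the second pass can fill them):

```python
import pandas as pd

# toy column mixing the two formats seen in the earthquake data
dates = pd.Series(["01/02/1965", "1975-02-23T02:58:41.000Z", "02/25/1975"])

# first pass: parse the dominant mm/dd/yyyy format; failures become NaT
parsed = pd.to_datetime(dates, format="%m/%d/%Y", errors="coerce")

# second pass: fill the NaT gaps with ISO-8601-parsed values
iso = pd.to_datetime(dates, format="%Y-%m-%dT%H:%M:%S.%fZ", errors="coerce")
parsed = parsed.fillna(iso)
```

This avoids hard-coding any row indices, so it keeps working if the dataset is re-exported with different corrupted rows.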
Create a Pandas Series day_of_month_earthquakes containing the day of the month from the "date_parsed" column.
# try to get the day of the month from the date column
day_of_month_earthquakes = earthquakes['date_parsed'].dt.day
# Check your answer
q3.check()
Correct
# Lines below will give you a hint or solution code
#q3.hint()
#q3.solution()
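The .dt accessor used above exposes all the usual calendar components once a column has a true datetime dtype. A small self-contained sketch:

```python
import pandas as pd

# parse two dates, then pull out their calendar components
s = pd.to_datetime(pd.Series(["01/02/1965", "03/13/2011"]), format="%m/%d/%Y")
print(s.dt.day.tolist())    # day of the month
print(s.dt.month.tolist())  # month number
print(s.dt.year.tolist())   # four-digit year
```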
Plot the days of the month from your earthquake dataset.
# TODO: Your code here!
sns.histplot(day_of_month_earthquakes, kde=False, bins=31)
<AxesSubplot:xlabel='date_parsed', ylabel='Count'>
Does the graph make sense to you?
# Check your answer (Run this code cell to receive credit!)
q4.check()
Correct:
The graph should make sense: it shows a relatively even distribution in days of the month, which is what we would expect.
# Line below will give you a hint
#q4.hint()
For an extra challenge, you'll work with a Smithsonian dataset that documents Earth's volcanoes and their eruptive history over the past 10,000 years.
Run the next code cell to load the data.
volcanos = pd.read_csv("../input/volcanic-eruptions/database.csv")
Try parsing the column "Last Known Eruption" from the volcanos dataframe. This column contains a mixture of text ("Unknown") and years both before the common era (BCE, also known as BC) and in the common era (CE, also known as AD).
volcanos['Last Known Eruption'].sample(5)
556     Unknown
557     Unknown
530    3500 BCE
130     1863 CE
1179    Unknown
Name: Last Known Eruption, dtype: object
volcano_eruptions_before_common_era = volcanos[volcanos['Last Known Eruption'].str.contains(' BCE')]
volcano_eruptions_common_era = volcanos[volcanos['Last Known Eruption'].str.contains(' CE')]
unknown_volcano_eruptions = volcanos[volcanos['Last Known Eruption'] == 'Unknown']
total_amount_of_data_from_all_the_groups = len(volcano_eruptions_before_common_era) + len(volcano_eruptions_common_era) + len(unknown_volcano_eruptions)
if len(volcanos) != total_amount_of_data_from_all_the_groups:
    print("Some data is missing")
print("BCE amount: {} this means a {:.2f}% of the dataset".format(volcano_eruptions_before_common_era.shape[0], (volcano_eruptions_before_common_era.shape[0]/total_amount_of_data_from_all_the_groups) * 100))
print("CE amount: {} this means a {:.2f}% of the dataset".format(volcano_eruptions_common_era.shape[0], (volcano_eruptions_common_era.shape[0]/total_amount_of_data_from_all_the_groups) * 100))
print("Unknown amount: {} this means a {:.2f}% of the dataset".format(unknown_volcano_eruptions.shape[0], (unknown_volcano_eruptions.shape[0]/total_amount_of_data_from_all_the_groups) * 100))
BCE amount: 174 this means a 11.54% of the dataset CE amount: 697 this means a 46.22% of the dataset Unknown amount: 637 this means a 42.24% of the dataset
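Since datetime types can't represent years like 3500 BCE, one way to make the column numeric is to map each entry to a signed integer year, negating BCE values. A sketch with a hypothetical to_year helper on toy data:

```python
import numpy as np
import pandas as pd

# toy sample of the three kinds of values in "Last Known Eruption"
eruptions = pd.Series(["Unknown", "3500 BCE", "1863 CE"])

def to_year(value):
    """Map '3500 BCE' -> -3500, '1863 CE' -> 1863, 'Unknown' -> NaN."""
    if value == "Unknown":
        return np.nan
    number, era = value.split()
    return -int(number) if era == "BCE" else int(number)

years = eruptions.map(to_year)
```

The signed-integer column can then be sorted, filtered, and plotted like any other numeric column.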
If you're interested in graphing time series, check out this tutorial.
You can also look into passing columns that you know contain dates to the parse_dates argument of read_csv. (The documentation is here.) Do note that this method can be very slow, but depending on your needs it may sometimes be handy to use.
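A minimal sketch of the parse_dates approach, using a tiny inline CSV rather than the competition files:

```python
import io
import pandas as pd

csv_text = "Date,Magnitude\n01/02/1965,6.0\n01/04/1965,5.8\n"

# parse_dates tells read_csv to convert the named columns while loading
df = pd.read_csv(io.StringIO(csv_text), parse_dates=["Date"])
print(df["Date"].dtype)  # a datetime64 dtype, not object
```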
In the next lesson, learn how to work with character encodings.
Have questions or comments? Visit the Learn Discussion forum to chat with other Learners.