In [1]:
# Import statements
import pandas as pd
import numpy as np
import seaborn as sns
import datetime
from matplotlib import pyplot as plt
In [3]:
# Read in the 2018 data.
df = pd.read_csv('eda_structuring_with_python_dataset1.csv')
df.head()
Out[3]:
| | date | number_of_strikes | center_point_geom |
|---|---|---|---|
| 0 | 2018-01-03 | 194 | POINT(-75 27) |
| 1 | 2018-01-03 | 41 | POINT(-78.4 29) |
| 2 | 2018-01-03 | 33 | POINT(-73.9 27) |
| 3 | 2018-01-03 | 38 | POINT(-73.8 27) |
| 4 | 2018-01-03 | 92 | POINT(-79 28) |
In [4]:
# Convert the `date` column to datetime.
df['date'] = pd.to_datetime(df['date'])
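As an aside, the parse can also happen at read time; a minimal one-step sketch, assuming the same file:

```python
# Equivalent alternative: have read_csv parse the `date` column directly.
df = pd.read_csv('eda_structuring_with_python_dataset1.csv', parse_dates=['date'])
```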
In [5]:
df.shape
Out[5]:
(3401012, 3)
In [7]:
# Check for duplicates. If the shape is unchanged, there are no exact duplicates.
df.drop_duplicates().shape
Out[7]:
(3401012, 3)
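The shape matches `df.shape`, so there are no exact duplicates. An explicit count avoids comparing tuples by eye; a small sketch:

```python
# Count fully duplicated rows directly; 0 confirms there are none.
print(df.duplicated().sum())
```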
In [8]:
# Sort by number of strikes in descending order.
df.sort_values(by='number_of_strikes', ascending=False).head(10)
Out[8]:
| | date | number_of_strikes | center_point_geom |
|---|---|---|---|
| 302758 | 2018-08-20 | 2211 | POINT(-92.5 35.5) |
| 278383 | 2018-08-16 | 2142 | POINT(-96.1 36.1) |
| 280830 | 2018-08-17 | 2061 | POINT(-90.2 36.1) |
| 280453 | 2018-08-17 | 2031 | POINT(-89.9 35.9) |
| 278382 | 2018-08-16 | 1902 | POINT(-96.2 36.1) |
| 11517 | 2018-02-10 | 1899 | POINT(-95.5 28.1) |
| 277506 | 2018-08-16 | 1878 | POINT(-89.7 31.5) |
| 24906 | 2018-02-25 | 1833 | POINT(-98.7 28.9) |
| 284320 | 2018-08-17 | 1767 | POINT(-90.1 36) |
| 24825 | 2018-02-25 | 1741 | POINT(-98 29) |
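For a top-N slice like this, `nlargest` states the intent directly and skips sorting the full frame; a sketch producing the same ten rows:

```python
# Top 10 rows by strike count, without a full sort.
df.nlargest(10, 'number_of_strikes')
```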
In [9]:
# Identify the locations that appear most in the dataset.
df.center_point_geom.value_counts()
Out[9]:
center_point_geom
POINT(-81.5 22.5) 108
POINT(-84.1 22.4) 108
POINT(-82.7 22.9) 107
POINT(-82.5 22.9) 107
POINT(-84.2 22.3) 106
...
POINT(-130.2 47.4) 1
POINT(-67.5 41.7) 1
POINT(-60.2 44.1) 1
POINT(-117.8 34.6) 1
POINT(-60.4 44.5) 1
Name: count, Length: 170855, dtype: int64
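The `Length: 170855` footer is the number of distinct locations; `nunique` returns that count directly:

```python
# Number of distinct strike locations in the 2018 data.
print(df['center_point_geom'].nunique())
```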
In [10]:
# Identify the top 20 locations with the most days of lightning.
(df.center_point_geom.value_counts()[:20]
   .rename_axis('unique_values')
   .reset_index(name='counts')
   .style.background_gradient())
Out[10]:
| | unique_values | counts |
|---|---|---|
| 0 | POINT(-81.5 22.5) | 108 |
| 1 | POINT(-84.1 22.4) | 108 |
| 2 | POINT(-82.7 22.9) | 107 |
| 3 | POINT(-82.5 22.9) | 107 |
| 4 | POINT(-84.2 22.3) | 106 |
| 5 | POINT(-82.5 22.8) | 106 |
| 6 | POINT(-76 20.5) | 105 |
| 7 | POINT(-75.9 20.4) | 105 |
| 8 | POINT(-82.2 22.9) | 104 |
| 9 | POINT(-78 18.2) | 104 |
| 10 | POINT(-83.9 22.5) | 103 |
| 11 | POINT(-78 18.3) | 102 |
| 12 | POINT(-82 22.4) | 102 |
| 13 | POINT(-82 22.8) | 102 |
| 14 | POINT(-82.3 22.9) | 102 |
| 15 | POINT(-84 22.4) | 102 |
| 16 | POINT(-75.5 20.6) | 101 |
| 17 | POINT(-82 22.3) | 101 |
| 18 | POINT(-78.2 18.3) | 101 |
| 19 | POINT(-84 22.5) | 101 |
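If these locations ever need plotting on a map, the WKT-style `POINT(lon lat)` strings can be split into numeric columns. A sketch using a regular expression; the `longitude` and `latitude` column names are illustrative, not part of the original dataset:

```python
# Extract numeric coordinates from strings like 'POINT(-81.5 22.5)'.
coords = df['center_point_geom'].str.extract(r'POINT\((-?[\d.]+) (-?[\d.]+)\)')
df[['longitude', 'latitude']] = coords.astype(float)  # hypothetical new columns
```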
In [11]:
# Create two new columns.
df['week'] = df.date.dt.isocalendar().week
df['weekday'] = df.date.dt.day_name()
df.head()
Out[11]:
| | date | number_of_strikes | center_point_geom | week | weekday |
|---|---|---|---|---|---|
| 0 | 2018-01-03 | 194 | POINT(-75 27) | 1 | Wednesday |
| 1 | 2018-01-03 | 41 | POINT(-78.4 29) | 1 | Wednesday |
| 2 | 2018-01-03 | 33 | POINT(-73.9 27) | 1 | Wednesday |
| 3 | 2018-01-03 | 38 | POINT(-73.8 27) | 1 | Wednesday |
| 4 | 2018-01-03 | 92 | POINT(-79 28) | 1 | Wednesday |
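For reference, `dt.isocalendar()` (pandas 1.1+) returns a small dataframe of ISO calendar parts; the cell above keeps only its `week` column:

```python
# ISO calendar breakdown: year, week, and day columns (UInt32 dtype).
df['date'].dt.isocalendar().head()
```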
In [12]:
# Calculate the mean count of lightning strikes for each weekday.
df[['weekday','number_of_strikes']].groupby(['weekday']).mean()
Out[12]:
| | number_of_strikes |
|---|---|
| weekday | |
| Friday | 13.349972 |
| Monday | 13.152804 |
| Saturday | 12.732694 |
| Sunday | 12.324717 |
| Thursday | 13.240594 |
| Tuesday | 13.813599 |
| Wednesday | 13.224568 |
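The groupby result sorts alphabetically (Friday first), which hides the weekly pattern. Reindexing with the day list defined in the next cell restores calendar order; a minimal sketch:

```python
# Reorder the weekday means chronologically rather than alphabetically.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
df.groupby('weekday')['number_of_strikes'].mean().reindex(weekday_order)
```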
In [13]:
# Define order of days for the plot.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
In [14]:
# Create boxplots of strike counts for each day of the week.
g = sns.boxplot(data=df,
                x='weekday',
                y='number_of_strikes',
                order=weekday_order,
                showfliers=False  # hide outlier points so the boxes stay readable
                );
g.set_title('Lightning distribution per weekday (2018)');
In [15]:
# Read in the 2016–2017 data.
df_2 = pd.read_csv('eda_structuring_with_python_dataset2.csv')
df_2.head()
Out[15]:
| | date | number_of_strikes | center_point_geom |
|---|---|---|---|
| 0 | 2016-01-04 | 55 | POINT(-83.2 21.1) |
| 1 | 2016-01-04 | 33 | POINT(-83.1 21.1) |
| 2 | 2016-01-05 | 46 | POINT(-77.5 22.1) |
| 3 | 2016-01-05 | 28 | POINT(-76.8 22.3) |
| 4 | 2016-01-05 | 28 | POINT(-77 22.1) |
In [16]:
# Convert the `date` column to datetime.
df_2['date'] = pd.to_datetime(df_2['date'])
In [17]:
# Create a new dataframe combining 2016–2017 data with 2018 data.
union_df = pd.concat([df.drop(['weekday','week'],axis=1), df_2], ignore_index=True)
union_df.head()
Out[17]:
| | date | number_of_strikes | center_point_geom |
|---|---|---|---|
| 0 | 2018-01-03 | 194 | POINT(-75 27) |
| 1 | 2018-01-03 | 41 | POINT(-78.4 29) |
| 2 | 2018-01-03 | 33 | POINT(-73.9 27) |
| 3 | 2018-01-03 | 38 | POINT(-73.8 27) |
| 4 | 2018-01-03 | 92 | POINT(-79 28) |
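A quick sanity check that the concatenation kept every row from both inputs:

```python
# The union should contain exactly the rows of both source frames.
assert len(union_df) == len(df) + len(df_2)
print(union_df.shape)
```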
In [18]:
# Add three new columns: year, month number, and month name.
union_df['year'] = union_df.date.dt.year
union_df['month'] = union_df.date.dt.month
union_df['month_txt'] = union_df.date.dt.month_name()
union_df.head()
Out[18]:
| | date | number_of_strikes | center_point_geom | year | month | month_txt |
|---|---|---|---|---|---|---|
| 0 | 2018-01-03 | 194 | POINT(-75 27) | 2018 | 1 | January |
| 1 | 2018-01-03 | 41 | POINT(-78.4 29) | 2018 | 1 | January |
| 2 | 2018-01-03 | 33 | POINT(-73.9 27) | 2018 | 1 | January |
| 3 | 2018-01-03 | 38 | POINT(-73.8 27) | 2018 | 1 | January |
| 4 | 2018-01-03 | 92 | POINT(-79 28) | 2018 | 1 | January |
In [19]:
# Calculate the total number of strikes per year.
union_df[['year','number_of_strikes']].groupby(['year']).sum()
Out[19]:
| | number_of_strikes |
|---|---|
| year | |
| 2016 | 41582229 |
| 2017 | 35095195 |
| 2018 | 44600989 |
In [21]:
# Calculate total lightning strikes for each month of each year.
lightning_by_month = union_df.groupby(['month_txt','year']).agg(
    number_of_strikes=pd.NamedAgg(column='number_of_strikes', aggfunc="sum")
).reset_index()
lightning_by_month.head()
Out[21]:
| | month_txt | year | number_of_strikes |
|---|---|---|---|
| 0 | April | 2016 | 2636427 |
| 1 | April | 2017 | 3819075 |
| 2 | April | 2018 | 1524339 |
| 3 | August | 2016 | 7250442 |
| 4 | August | 2017 | 6021702 |
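Note the months appear alphabetically (April, August, ...) because `month_txt` is a plain string column. Converting it to an ordered categorical makes sorts follow the calendar instead; a sketch on a copy, so the original frame is untouched:

```python
# Sort the monthly totals chronologically instead of alphabetically.
month_order = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']
by_month = lightning_by_month.copy()
by_month['month_txt'] = pd.Categorical(by_month['month_txt'],
                                       categories=month_order, ordered=True)
by_month = by_month.sort_values(['month_txt', 'year'])
```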
In [23]:
# Calculate total lightning strikes for each year.
lightning_by_year = union_df.groupby(['year']).agg(
    year_strikes=pd.NamedAgg(column='number_of_strikes', aggfunc="sum")
).reset_index()
lightning_by_year.head()
Out[23]:
| | year | year_strikes |
|---|---|---|
| 0 | 2016 | 41582229 |
| 1 | 2017 | 35095195 |
| 2 | 2018 | 44600989 |
In [24]:
# Combine the `lightning_by_month` and `lightning_by_year` dataframes into a single dataframe.
percentage_lightning = lightning_by_month.merge(lightning_by_year, on='year')
percentage_lightning.head()
Out[24]:
| | month_txt | year | number_of_strikes | year_strikes |
|---|---|---|---|---|
| 0 | April | 2016 | 2636427 | 41582229 |
| 1 | April | 2017 | 3819075 | 35095195 |
| 2 | April | 2018 | 1524339 | 44600989 |
| 3 | August | 2016 | 7250442 | 41582229 |
| 4 | August | 2017 | 6021702 | 35095195 |
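The merge works, but the same yearly totals can be broadcast back onto the monthly rows in one step with `transform`; an equivalent sketch without the intermediate dataframe:

```python
# Alternative to the merge: attach each year's total via groupby-transform.
lightning_by_month['year_strikes'] = (
    lightning_by_month.groupby('year')['number_of_strikes'].transform('sum')
)
```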
In [25]:
# Create new `percentage_lightning_per_month` column.
percentage_lightning['percentage_lightning_per_month'] = (
    percentage_lightning.number_of_strikes / percentage_lightning.year_strikes * 100.0
)
percentage_lightning.head()
Out[25]:
| | month_txt | year | number_of_strikes | year_strikes | percentage_lightning_per_month |
|---|---|---|---|---|---|
| 0 | April | 2016 | 2636427 | 41582229 | 6.340273 |
| 1 | April | 2017 | 3819075 | 35095195 | 10.882045 |
| 2 | April | 2018 | 1524339 | 44600989 | 3.417725 |
| 3 | August | 2016 | 7250442 | 41582229 | 17.436396 |
| 4 | August | 2017 | 6021702 | 35095195 | 17.158195 |
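A useful sanity check: within each year, the twelve monthly percentages should total 100 (up to floating-point rounding):

```python
# Percentages per year should sum to 100.
print(percentage_lightning.groupby('year')['percentage_lightning_per_month'].sum())
```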
In [26]:
plt.figure(figsize=(10,6));
month_order = ['January', 'February', 'March', 'April', 'May', 'June',
'July', 'August', 'September', 'October', 'November', 'December']
sns.barplot(data=percentage_lightning,
            x='month_txt',
            y='percentage_lightning_per_month',
            hue='year',
            order=month_order);
plt.xlabel("Month");
plt.ylabel("% of lightning strikes");
plt.title("% of lightning strikes each month (2016–2018)");