In [1]:
# Import statements
import pandas as pd
import numpy as np
import seaborn as sns
import datetime
from matplotlib import pyplot as plt
In [3]:
# Read in the 2018 lightning-strike data and preview the first rows.
# Columns: date (string at this point), number_of_strikes, center_point_geom (WKT point).
df = pd.read_csv('eda_structuring_with_python_dataset1.csv') 
df.head()
Out[3]:
date number_of_strikes center_point_geom
0 2018-01-03 194 POINT(-75 27)
1 2018-01-03 41 POINT(-78.4 29)
2 2018-01-03 33 POINT(-73.9 27)
3 2018-01-03 38 POINT(-73.8 27)
4 2018-01-03 92 POINT(-79 28)
In [4]:
# Convert the `date` column to datetime so the `.dt` accessor
# (isocalendar, day_name, etc.) can be used in later cells.
df['date'] = pd.to_datetime(df['date']) 
In [5]:
df.shape
Out[5]:
(3401012, 3)
In [7]:
# Check for duplicates: if the shape after drop_duplicates() matches
# df.shape from the previous cell, there are no fully duplicated rows.
df.drop_duplicates().shape
Out[7]:
(3401012, 3)
In [8]:
# Show the ten rows with the highest single-day strike counts.
df.sort_values('number_of_strikes', ascending=False).head(10)
Out[8]:
date number_of_strikes center_point_geom
302758 2018-08-20 2211 POINT(-92.5 35.5)
278383 2018-08-16 2142 POINT(-96.1 36.1)
280830 2018-08-17 2061 POINT(-90.2 36.1)
280453 2018-08-17 2031 POINT(-89.9 35.9)
278382 2018-08-16 1902 POINT(-96.2 36.1)
11517 2018-02-10 1899 POINT(-95.5 28.1)
277506 2018-08-16 1878 POINT(-89.7 31.5)
24906 2018-02-25 1833 POINT(-98.7 28.9)
284320 2018-08-17 1767 POINT(-90.1 36)
24825 2018-02-25 1741 POINT(-98 29)
In [9]:
# Count how many distinct days of lightning each location records.
df['center_point_geom'].value_counts()
Out[9]:
center_point_geom
POINT(-81.5 22.5)     108
POINT(-84.1 22.4)     108
POINT(-82.7 22.9)     107
POINT(-82.5 22.9)     107
POINT(-84.2 22.3)     106
                     ... 
POINT(-130.2 47.4)      1
POINT(-67.5 41.7)       1
POINT(-60.2 44.1)       1
POINT(-117.8 34.6)      1
POINT(-60.4 44.5)       1
Name: count, Length: 170855, dtype: int64
In [10]:
# Top 20 locations by number of days with recorded lightning,
# rendered as a gradient-shaded table.
(df['center_point_geom']
 .value_counts()
 .head(20)
 .rename_axis('unique_values')
 .reset_index(name='counts')
 .style.background_gradient())
Out[10]:
  unique_values counts
0 POINT(-81.5 22.5) 108
1 POINT(-84.1 22.4) 108
2 POINT(-82.7 22.9) 107
3 POINT(-82.5 22.9) 107
4 POINT(-84.2 22.3) 106
5 POINT(-82.5 22.8) 106
6 POINT(-76 20.5) 105
7 POINT(-75.9 20.4) 105
8 POINT(-82.2 22.9) 104
9 POINT(-78 18.2) 104
10 POINT(-83.9 22.5) 103
11 POINT(-78 18.3) 102
12 POINT(-82 22.4) 102
13 POINT(-82 22.8) 102
14 POINT(-82.3 22.9) 102
15 POINT(-84 22.4) 102
16 POINT(-75.5 20.6) 101
17 POINT(-82 22.3) 101
18 POINT(-78.2 18.3) 101
19 POINT(-84 22.5) 101
In [11]:
# Derive two new columns from `date`: ISO week number and weekday name.
df['week'] = df['date'].dt.isocalendar().week
df['weekday'] = df['date'].dt.day_name()
df.head()
Out[11]:
date number_of_strikes center_point_geom week weekday
0 2018-01-03 194 POINT(-75 27) 1 Wednesday
1 2018-01-03 41 POINT(-78.4 29) 1 Wednesday
2 2018-01-03 33 POINT(-73.9 27) 1 Wednesday
3 2018-01-03 38 POINT(-73.8 27) 1 Wednesday
4 2018-01-03 92 POINT(-79 28) 1 Wednesday
In [12]:
# Mean daily strike count for each day of the week.
df.groupby('weekday')[['number_of_strikes']].mean()
Out[12]:
number_of_strikes
weekday
Friday 13.349972
Monday 13.152804
Saturday 12.732694
Sunday 12.324717
Thursday 13.240594
Tuesday 13.813599
Wednesday 13.224568
In [13]:
# Fix the weekday ordering for plots (Monday first, Sunday last).
weekday_order = [
    'Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
    'Saturday',
    'Sunday',
]
In [14]:
# Boxplots of daily strike counts per weekday; outliers are hidden
# so the boxes stay readable at this scale.
ax = sns.boxplot(
    data=df,
    x='weekday',
    y='number_of_strikes',
    order=weekday_order,
    showfliers=False,
);
ax.set_title('Lightning distribution per weekday (2018)');
[Figure: boxplots of daily lightning-strike counts for each weekday of 2018, outliers hidden.]
In [15]:
# Import 2016–2017 data (same schema as the 2018 file) and preview it.
df_2 = pd.read_csv('eda_structuring_with_python_dataset2.csv')
df_2.head()
Out[15]:
date number_of_strikes center_point_geom
0 2016-01-04 55 POINT(-83.2 21.1)
1 2016-01-04 33 POINT(-83.1 21.1)
2 2016-01-05 46 POINT(-77.5 22.1)
3 2016-01-05 28 POINT(-76.8 22.3)
4 2016-01-05 28 POINT(-77 22.1)
In [16]:
# Convert `date` column to datetime so it matches the 2018 frame's
# dtype before the two are concatenated.
df_2['date'] = pd.to_datetime(df_2['date'])
In [17]:
# Stack the 2018 data (minus its derived columns) on top of the
# 2016–2017 data into one frame with a fresh integer index.
union_df = pd.concat(
    [df.drop(columns=['weekday', 'week']), df_2],
    ignore_index=True,
)
union_df.head()
Out[17]:
date number_of_strikes center_point_geom
0 2018-01-03 194 POINT(-75 27)
1 2018-01-03 41 POINT(-78.4 29)
2 2018-01-03 33 POINT(-73.9 27)
3 2018-01-03 38 POINT(-73.8 27)
4 2018-01-03 92 POINT(-79 28)
In [18]:
# Derive year, month number, and month name from the `date` column.
date_parts = union_df['date'].dt
union_df['year'] = date_parts.year
union_df['month'] = date_parts.month
union_df['month_txt'] = date_parts.month_name()
union_df.head()
Out[18]:
date number_of_strikes center_point_geom year month month_txt
0 2018-01-03 194 POINT(-75 27) 2018 1 January
1 2018-01-03 41 POINT(-78.4 29) 2018 1 January
2 2018-01-03 33 POINT(-73.9 27) 2018 1 January
3 2018-01-03 38 POINT(-73.8 27) 2018 1 January
4 2018-01-03 92 POINT(-79 28) 2018 1 January
In [19]:
# Total number of strikes recorded in each year.
union_df.groupby('year')[['number_of_strikes']].sum()
Out[19]:
number_of_strikes
year
2016 41582229
2017 35095195
2018 44600989
In [21]:
# Sum strikes for every (month, year) pair; tuple shorthand is
# equivalent to pd.NamedAgg(column=..., aggfunc=...).
lightning_by_month = (
    union_df
    .groupby(['month_txt', 'year'])
    .agg(number_of_strikes=('number_of_strikes', 'sum'))
    .reset_index()
)

lightning_by_month.head()
Out[21]:
month_txt year number_of_strikes
0 April 2016 2636427
1 April 2017 3819075
2 April 2018 1524339
3 August 2016 7250442
4 August 2017 6021702
In [23]:
# Calculate total lightning strikes for each year.
lightning_by_year = (
    union_df
    .groupby('year')
    .agg(year_strikes=('number_of_strikes', 'sum'))
    .reset_index()
)

lightning_by_year.head()
Out[23]:
year year_strikes
0 2016 41582229
1 2017 35095195
2 2018 44600989
In [24]:
# Attach each year's total to every monthly row by joining on `year`.
percentage_lightning = pd.merge(lightning_by_month, lightning_by_year, on='year')
percentage_lightning.head()
Out[24]:
month_txt year number_of_strikes year_strikes
0 April 2016 2636427 41582229
1 April 2017 3819075 35095195
2 April 2018 1524339 44600989
3 August 2016 7250442 41582229
4 August 2017 6021702 35095195
In [25]:
# Express each month's strikes as a percentage of that year's total.
percentage_lightning['percentage_lightning_per_month'] = (
    percentage_lightning['number_of_strikes']
    .div(percentage_lightning['year_strikes'])
    .mul(100.0)
)
percentage_lightning.head()
Out[25]:
month_txt year number_of_strikes year_strikes percentage_lightning_per_month
0 April 2016 2636427 41582229 6.340273
1 April 2017 3819075 35095195 10.882045
2 April 2018 1524339 44600989 3.417725
3 August 2016 7250442 41582229 17.436396
4 August 2017 6021702 35095195 17.158195
In [26]:
# Grouped bar chart: each month's share of its year's total strikes,
# one bar per year, months in calendar order.
month_order = ['January', 'February', 'March', 'April', 'May', 'June', 
               'July', 'August', 'September', 'October', 'November', 'December']

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=percentage_lightning,
    x='month_txt',
    y='percentage_lightning_per_month',
    hue='year',
    order=month_order,
    ax=ax,
);
ax.set_xlabel("Month");
ax.set_ylabel("% of lightning strikes");
ax.set_title("% of lightning strikes each Month (2016-2018)");
[Figure: grouped bar chart of each month's percentage of yearly lightning strikes, one bar per year, 2016–2018.]
In [ ]: