In [1]:
# Import statements
import pandas as pd
import numpy as np
import seaborn as sns
import datetime
from matplotlib import pyplot as plt
In [3]:
# Read in the 2018 data.
df = pd.read_csv('eda_structuring_with_python_dataset1.csv')
df.head()
Out[3]:
| | date | number_of_strikes | center_point_geom |
|---|---|---|---|
| 0 | 2018-01-03 | 194 | POINT(-75 27) |
| 1 | 2018-01-03 | 41 | POINT(-78.4 29) |
| 2 | 2018-01-03 | 33 | POINT(-73.9 27) |
| 3 | 2018-01-03 | 38 | POINT(-73.8 27) |
| 4 | 2018-01-03 | 92 | POINT(-79 28) |
In [4]:
# Convert the `date` column to datetime.
df['date'] = pd.to_datetime(df['date'])
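As an aside, the parse can also happen at read time; a minimal one-step sketch, assuming the same file:

```python
# Equivalent alternative: have read_csv parse the `date` column directly.
df = pd.read_csv('eda_structuring_with_python_dataset1.csv', parse_dates=['date'])
```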
In [5]:
df.shape
Out[5]:
(3401012, 3)
In [7]:
# Check for duplicates. If the shape is unchanged, there are no exact duplicates.
df.drop_duplicates().shape
Out[7]:
(3401012, 3)
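The shape matches `df.shape`, so there are no exact duplicates. An explicit count avoids comparing tuples by eye; a small sketch:

```python
# Count fully duplicated rows directly; 0 confirms there are none.
print(df.duplicated().sum())
```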
In [8]:
# Sort by number of strikes in descending order.
df.sort_values(by='number_of_strikes', ascending=False).head(10)
Out[8]:
| | date | number_of_strikes | center_point_geom |
|---|---|---|---|
| 302758 | 2018-08-20 | 2211 | POINT(-92.5 35.5) |
| 278383 | 2018-08-16 | 2142 | POINT(-96.1 36.1) |
| 280830 | 2018-08-17 | 2061 | POINT(-90.2 36.1) |
| 280453 | 2018-08-17 | 2031 | POINT(-89.9 35.9) |
| 278382 | 2018-08-16 | 1902 | POINT(-96.2 36.1) |
| 11517 | 2018-02-10 | 1899 | POINT(-95.5 28.1) |
| 277506 | 2018-08-16 | 1878 | POINT(-89.7 31.5) |
| 24906 | 2018-02-25 | 1833 | POINT(-98.7 28.9) |
| 284320 | 2018-08-17 | 1767 | POINT(-90.1 36) |
| 24825 | 2018-02-25 | 1741 | POINT(-98 29) |
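For a top-N slice like this, `nlargest` states the intent directly and skips sorting the full frame; a sketch producing the same ten rows:

```python
# Top 10 rows by strike count, without a full sort.
df.nlargest(10, 'number_of_strikes')
```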
In [9]:
# Identify the locations that appear most in the dataset.
df.center_point_geom.value_counts()
Out[9]:
center_point_geom
POINT(-81.5 22.5) 108
POINT(-84.1 22.4) 108
POINT(-82.7 22.9) 107
POINT(-82.5 22.9) 107
POINT(-84.2 22.3) 106
...
POINT(-130.2 47.4) 1
POINT(-67.5 41.7) 1
POINT(-60.2 44.1) 1
POINT(-117.8 34.6) 1
POINT(-60.4 44.5) 1
Name: count, Length: 170855, dtype: int64
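The `Length: 170855` footer is the number of distinct locations; `nunique` returns that count directly:

```python
# Number of distinct strike locations in the 2018 data.
print(df['center_point_geom'].nunique())
```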
In [10]:
# Identify the top 20 locations with the most days of lightning.
(df.center_point_geom.value_counts()[:20]
   .rename_axis('unique_values')
   .reset_index(name='counts')
   .style.background_gradient())
Out[10]:
| | unique_values | counts |
|---|---|---|
| 0 | POINT(-81.5 22.5) | 108 |
| 1 | POINT(-84.1 22.4) | 108 |
| 2 | POINT(-82.7 22.9) | 107 |
| 3 | POINT(-82.5 22.9) | 107 |
| 4 | POINT(-84.2 22.3) | 106 |
| 5 | POINT(-82.5 22.8) | 106 |
| 6 | POINT(-76 20.5) | 105 |
| 7 | POINT(-75.9 20.4) | 105 |
| 8 | POINT(-82.2 22.9) | 104 |
| 9 | POINT(-78 18.2) | 104 |
| 10 | POINT(-83.9 22.5) | 103 |
| 11 | POINT(-78 18.3) | 102 |
| 12 | POINT(-82 22.4) | 102 |
| 13 | POINT(-82 22.8) | 102 |
| 14 | POINT(-82.3 22.9) | 102 |
| 15 | POINT(-84 22.4) | 102 |
| 16 | POINT(-75.5 20.6) | 101 |
| 17 | POINT(-82 22.3) | 101 |
| 18 | POINT(-78.2 18.3) | 101 |
| 19 | POINT(-84 22.5) | 101 |
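If these locations ever need plotting on a map, the WKT-style `POINT(lon lat)` strings can be split into numeric columns. A sketch using a regular expression; the `longitude` and `latitude` column names are illustrative, not part of the original dataset:

```python
# Extract numeric coordinates from strings like 'POINT(-81.5 22.5)'.
coords = df['center_point_geom'].str.extract(r'POINT\((-?[\d.]+) (-?[\d.]+)\)')
df[['longitude', 'latitude']] = coords.astype(float)  # hypothetical new columns
```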
In [11]:
# Create two new columns.
df['week'] = df.date.dt.isocalendar().week
df['weekday'] = df.date.dt.day_name()
df.head()
Out[11]:
| | date | number_of_strikes | center_point_geom | week | weekday |
|---|---|---|---|---|---|
| 0 | 2018-01-03 | 194 | POINT(-75 27) | 1 | Wednesday |
| 1 | 2018-01-03 | 41 | POINT(-78.4 29) | 1 | Wednesday |
| 2 | 2018-01-03 | 33 | POINT(-73.9 27) | 1 | Wednesday |
| 3 | 2018-01-03 | 38 | POINT(-73.8 27) | 1 | Wednesday |
| 4 | 2018-01-03 | 92 | POINT(-79 28) | 1 | Wednesday |
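For reference, `dt.isocalendar()` (pandas 1.1+) returns a small dataframe of ISO calendar parts; the cell above keeps only its `week` column:

```python
# ISO calendar breakdown: year, week, and day columns (UInt32 dtype).
df['date'].dt.isocalendar().head()
```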
In [12]:
# Calculate the mean count of lightning strikes for each weekday.
df[['weekday','number_of_strikes']].groupby(['weekday']).mean()
Out[12]:
| | number_of_strikes |
|---|---|
| weekday | |
| Friday | 13.349972 |
| Monday | 13.152804 |
| Saturday | 12.732694 |
| Sunday | 12.324717 |
| Thursday | 13.240594 |
| Tuesday | 13.813599 |
| Wednesday | 13.224568 |
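The groupby result sorts alphabetically (Friday first), which hides the weekly pattern. Reindexing with the day list defined in the next cell restores calendar order; a minimal sketch:

```python
# Reorder the weekday means chronologically rather than alphabetically.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
df.groupby('weekday')['number_of_strikes'].mean().reindex(weekday_order)
```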
In [13]:
# Define order of days for the plot.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
In [14]:
# Create boxplots of strike counts for each day of the week.
g = sns.boxplot(data=df,
                x='weekday',
                y='number_of_strikes',
                order=weekday_order,
                showfliers=False  # hide outlier points so the boxes stay readable
                );
g.set_title('Lightning distribution per weekday (2018)');
In [15]:
# Read in the 2016–2017 data.
df_2 = pd.read_csv('eda_structuring_with_python_dataset2.csv')
df_2.head()
Out[15]:
| | date | number_of_strikes | center_point_geom |
|---|---|---|---|
| 0 | 2016-01-04 | 55 | POINT(-83.2 21.1) |
| 1 | 2016-01-04 | 33 | POINT(-83.1 21.1) |
| 2 | 2016-01-05 | 46 | POINT(-77.5 22.1) |
| 3 | 2016-01-05 | 28 | POINT(-76.8 22.3) |
| 4 | 2016-01-05 | 28 | POINT(-77 22.1) |
In [16]:
# Convert the `date` column to datetime.
df_2['date'] = pd.to_datetime(df_2['date'])
In [17]:
# Create a new dataframe combining 2016–2017 data with 2018 data.
union_df = pd.concat([df.drop(['weekday','week'],axis=1), df_2], ignore_index=True)
union_df.head()
Out[17]:
| | date | number_of_strikes | center_point_geom |
|---|---|---|---|
| 0 | 2018-01-03 | 194 | POINT(-75 27) |
| 1 | 2018-01-03 | 41 | POINT(-78.4 29) |
| 2 | 2018-01-03 | 33 | POINT(-73.9 27) |
| 3 | 2018-01-03 | 38 | POINT(-73.8 27) |
| 4 | 2018-01-03 | 92 | POINT(-79 28) |
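A quick sanity check that the concatenation kept every row from both inputs:

```python
# The union should contain exactly the rows of both source frames.
assert len(union_df) == len(df) + len(df_2)
print(union_df.shape)
```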
In [18]:
# Add three new columns: year, month number, and month name.
union_df['year'] = union_df.date.dt.year
union_df['month'] = union_df.date.dt.month
union_df['month_txt'] = union_df.date.dt.month_name()
union_df.head()
Out[18]:
| | date | number_of_strikes | center_point_geom | year | month | month_txt |
|---|---|---|---|---|---|---|
| 0 | 2018-01-03 | 194 | POINT(-75 27) | 2018 | 1 | January |
| 1 | 2018-01-03 | 41 | POINT(-78.4 29) | 2018 | 1 | January |
| 2 | 2018-01-03 | 33 | POINT(-73.9 27) | 2018 | 1 | January |
| 3 | 2018-01-03 | 38 | POINT(-73.8 27) | 2018 | 1 | January |
| 4 | 2018-01-03 | 92 | POINT(-79 28) | 2018 | 1 | January |
In [19]:
# Calculate the total number of strikes per year.
union_df[['year','number_of_strikes']].groupby(['year']).sum()
Out[19]:
| | number_of_strikes |
|---|---|
| year | |
| 2016 | 41582229 |
| 2017 | 35095195 |
| 2018 | 44600989 |
In [21]:
# Calculate total lightning strikes for each month of each year.
lightning_by_month = union_df.groupby(['month_txt','year']).agg(
    number_of_strikes=pd.NamedAgg(column='number_of_strikes', aggfunc="sum")
).reset_index()
lightning_by_month.head()
Out[21]:
| | month_txt | year | number_of_strikes |
|---|---|---|---|
| 0 | April | 2016 | 2636427 |
| 1 | April | 2017 | 3819075 |
| 2 | April | 2018 | 1524339 |
| 3 | August | 2016 | 7250442 |
| 4 | August | 2017 | 6021702 |
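Note the months appear alphabetically (April, August, ...) because `month_txt` is a plain string column. Converting it to an ordered categorical makes sorts follow the calendar instead; a sketch on a copy, so the original frame is untouched:

```python
# Sort the monthly totals chronologically instead of alphabetically.
month_order = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']
by_month = lightning_by_month.copy()
by_month['month_txt'] = pd.Categorical(by_month['month_txt'],
                                       categories=month_order, ordered=True)
by_month = by_month.sort_values(['month_txt', 'year'])
```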
In [23]:
# Calculate total lightning strikes for each year.
lightning_by_year = union_df.groupby(['year']).agg(
    year_strikes=pd.NamedAgg(column='number_of_strikes', aggfunc="sum")
).reset_index()
lightning_by_year.head()
Out[23]:
| | year | year_strikes |
|---|---|---|
| 0 | 2016 | 41582229 |
| 1 | 2017 | 35095195 |
| 2 | 2018 | 44600989 |
In [24]:
# Combine the `lightning_by_month` and `lightning_by_year` dataframes into a single dataframe.
percentage_lightning = lightning_by_month.merge(lightning_by_year, on='year')
percentage_lightning.head()
Out[24]:
| | month_txt | year | number_of_strikes | year_strikes |
|---|---|---|---|---|
| 0 | April | 2016 | 2636427 | 41582229 |
| 1 | April | 2017 | 3819075 | 35095195 |
| 2 | April | 2018 | 1524339 | 44600989 |
| 3 | August | 2016 | 7250442 | 41582229 |
| 4 | August | 2017 | 6021702 | 35095195 |
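The merge works, but the same yearly totals can be broadcast back onto the monthly rows in one step with `transform`; an equivalent sketch without the intermediate dataframe:

```python
# Alternative to the merge: attach each year's total via groupby-transform.
lightning_by_month['year_strikes'] = (
    lightning_by_month.groupby('year')['number_of_strikes'].transform('sum')
)
```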
In [25]:
# Create new `percentage_lightning_per_month` column.
percentage_lightning['percentage_lightning_per_month'] = (
    percentage_lightning.number_of_strikes / percentage_lightning.year_strikes * 100.0
)
percentage_lightning.head()
Out[25]:
| | month_txt | year | number_of_strikes | year_strikes | percentage_lightning_per_month |
|---|---|---|---|---|---|
| 0 | April | 2016 | 2636427 | 41582229 | 6.340273 |
| 1 | April | 2017 | 3819075 | 35095195 | 10.882045 |
| 2 | April | 2018 | 1524339 | 44600989 | 3.417725 |
| 3 | August | 2016 | 7250442 | 41582229 | 17.436396 |
| 4 | August | 2017 | 6021702 | 35095195 | 17.158195 |
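A useful sanity check: within each year, the twelve monthly percentages should total 100 (up to floating-point rounding):

```python
# Percentages per year should sum to 100.
print(percentage_lightning.groupby('year')['percentage_lightning_per_month'].sum())
```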
In [26]:
plt.figure(figsize=(10,6));
month_order = ['January', 'February', 'March', 'April', 'May', 'June',
'July', 'August', 'September', 'October', 'November', 'December']
sns.barplot(data=percentage_lightning,
            x='month_txt',
            y='percentage_lightning_per_month',
            hue='year',
            order=month_order);
plt.xlabel("Month");
plt.ylabel("% of lightning strikes");
plt.title("% of lightning strikes each month (2016–2018)");