In [10]:
# Import statements
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
In [11]:
# Read in the data.
df = pd.read_csv('eda_manipulate_date_strings_with_python.csv')
df.head()
Out[11]:
date number_of_strikes center_point_geom
0 2016-08-05 16 POINT(-101.5 24.7)
1 2016-08-05 16 POINT(-85 34.3)
2 2016-08-05 16 POINT(-89 41.4)
3 2016-08-05 16 POINT(-89.8 30.7)
4 2016-08-05 16 POINT(-86.2 37.9)
In [17]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10479003 entries, 0 to 10479002
Data columns (total 7 columns):
 #   Column             Dtype         
---  ------             -----         
 0   date               datetime64[ns]
 1   number_of_strikes  int64         
 2   center_point_geom  object        
 3   week               object        
 4   month              object        
 5   quarter            object        
 6   year               object        
dtypes: datetime64[ns](1), int64(1), object(5)
memory usage: 559.6+ MB
In [18]:
df.describe
Out[18]:
<bound method NDFrame.describe of                date  number_of_strikes   center_point_geom      week    month  \
0        2016-08-05                 16  POINT(-101.5 24.7)  2016-W31  2016-08   
1        2016-08-05                 16     POINT(-85 34.3)  2016-W31  2016-08   
2        2016-08-05                 16     POINT(-89 41.4)  2016-W31  2016-08   
3        2016-08-05                 16   POINT(-89.8 30.7)  2016-W31  2016-08   
4        2016-08-05                 16   POINT(-86.2 37.9)  2016-W31  2016-08   
...             ...                ...                 ...       ...      ...   
10478998 2018-05-27                 13    POINT(-111.3 41)  2018-W21  2018-05   
10478999 2018-05-27                 13  POINT(-109.4 43.4)  2018-W21  2018-05   
10479000 2018-05-27                 13    POINT(-104.2 50)  2018-W21  2018-05   
10479001 2018-05-27                 13   POINT(-88.7 36.4)  2018-W21  2018-05   
10479002 2018-05-27                 13   POINT(-87.1 44.9)  2018-W21  2018-05   

          quarter  year  
0         2016-Q3  2016  
1         2016-Q3  2016  
2         2016-Q3  2016  
3         2016-Q3  2016  
4         2016-Q3  2016  
...           ...   ...  
10478998  2018-Q2  2018  
10478999  2018-Q2  2018  
10479000  2018-Q2  2018  
10479001  2018-Q2  2018  
10479002  2018-Q2  2018  

[10479003 rows x 7 columns]>
In [19]:
# Convert the `date` column to datetime.
df['date'] = pd.to_datetime(df['date'])
In [20]:
##create four new columns: week, month, quarter, and year##

# Create four new columns.
df['week'] = df['date'].dt.strftime('%Y-W%V')
df['month'] = df['date'].dt.strftime('%Y-%m')
df['quarter'] = df['date'].dt.to_period('Q').dt.strftime('%Y-Q%q')
df['year'] = df['date'].dt.strftime('%Y')
In [21]:
df.head(10)
Out[21]:
date number_of_strikes center_point_geom week month quarter year
0 2016-08-05 16 POINT(-101.5 24.7) 2016-W31 2016-08 2016-Q3 2016
1 2016-08-05 16 POINT(-85 34.3) 2016-W31 2016-08 2016-Q3 2016
2 2016-08-05 16 POINT(-89 41.4) 2016-W31 2016-08 2016-Q3 2016
3 2016-08-05 16 POINT(-89.8 30.7) 2016-W31 2016-08 2016-Q3 2016
4 2016-08-05 16 POINT(-86.2 37.9) 2016-W31 2016-08 2016-Q3 2016
5 2016-08-05 16 POINT(-97.8 38.9) 2016-W31 2016-08 2016-Q3 2016
6 2016-08-05 16 POINT(-81.9 36) 2016-W31 2016-08 2016-Q3 2016
7 2016-08-05 16 POINT(-90.9 36.7) 2016-W31 2016-08 2016-Q3 2016
8 2016-08-05 16 POINT(-106.6 26.1) 2016-W31 2016-08 2016-Q3 2016
9 2016-08-05 16 POINT(-108 31.6) 2016-W31 2016-08 2016-Q3 2016
In [24]:
# Create a new dataframe view of just 2018 data, summed by week.
# Create a new dataframe view of just 2018 data, summed by week
df_by_week_2018 = df[df['year'] == '2018'].groupby(['week']).sum(numeric_only=True).reset_index()
df_by_week_2018.head()


# NOTE: In pandas v.2.X+ you must set 'numeric_only=True' in the sum() function or else it will throw an error
Out[24]:
week number_of_strikes
0 2018-W01 34843
1 2018-W02 353425
2 2018-W03 37132
3 2018-W04 412772
4 2018-W05 34972
In [25]:
# Plot a bar graph of weekly strike totals in 2018.
plt.bar(x = df_by_week_2018['week'], height = df_by_week_2018['number_of_strikes'])
plt.plot()
plt.xlabel("Week number")
plt.ylabel("Number of lightning strikes")
plt.title("Number of lightning strikes per week (2018)");
No description has been provided for this image
In [26]:
plt.figure(figsize = (20, 5)) # Increase output size.
plt.bar(x = df_by_week_2018['week'], height = df_by_week_2018['number_of_strikes'])
plt.plot()
plt.xlabel("Week number")
plt.ylabel("Number of lightning strikes")
plt.title("Number of lightning strikes per week (2018)")
plt.xticks(rotation = 45, fontsize = 8) # Rotate x-axis labels and decrease font size.

plt.show()
No description has been provided for this image
In [27]:
df_by_quarter = df['number_of_strikes'].div(1000000)
df_by_quarter.head()
Out[27]:
0    0.000016
1    0.000016
2    0.000016
3    0.000016
4    0.000016
Name: number_of_strikes, dtype: float64
In [29]:
# Group 2016-2018 data by quarter and sum
df_by_quarter = df.groupby(['quarter']).sum(numeric_only=True).reset_index()

# Format as text, in millions
df_by_quarter['number_of_strikes_formatted'] = (
    df_by_quarter['number_of_strikes']
    .div(1000000)
    .round(1)
    .astype(str) + 'M'
)

df_by_quarter.head()
Out[29]:
quarter number_of_strikes number_of_strikes_formatted
0 2016-Q1 2683798 2.7M
1 2016-Q2 15084857 15.1M
2 2016-Q3 21843820 21.8M
3 2016-Q4 1969754 2.0M
4 2017-Q1 2444279 2.4M
In [30]:
def addlabels(x, y, labels):
    '''
    Iterates over data and plots text labels above each bar of bar graph.
    '''
    for i in range(len(x)):
        plt.text(i, y[i], labels[i], ha = 'center', va = 'bottom')
In [31]:
plt.figure(figsize = (15, 5))
plt.bar(x = df_by_quarter['quarter'], height = df_by_quarter['number_of_strikes'])
addlabels(df_by_quarter['quarter'], df_by_quarter['number_of_strikes'], df_by_quarter['number_of_strikes_formatted'])
plt.plot()
plt.xlabel('Quarter')
plt.ylabel('Number of lightning strikes')
plt.title('Number of lightning strikes per quarter (2016-2018)')
plt.show()
No description has been provided for this image
In [32]:
# Create two new columns.
df_by_quarter['quarter_number'] = df_by_quarter['quarter'].str[-2:]
df_by_quarter['year'] = df_by_quarter['quarter'].str[:4]
df_by_quarter.head()
Out[32]:
quarter number_of_strikes number_of_strikes_formatted quarter_number year
0 2016-Q1 2683798 2.7M Q1 2016
1 2016-Q2 15084857 15.1M Q2 2016
2 2016-Q3 21843820 21.8M Q3 2016
3 2016-Q4 1969754 2.0M Q4 2016
4 2017-Q1 2444279 2.4M Q1 2017
In [33]:
plt.figure(figsize = (15, 5))
p = sns.barplot(
    data = df_by_quarter,
    x = 'quarter_number',
    y = 'number_of_strikes',
    hue = 'year')
for b in p.patches:
    p.annotate(str(round(b.get_height()/1000000, 1))+'M', 
                   (b.get_x() + b.get_width() / 2., b.get_height() + 1.2e6), 
                   ha = 'center', va = 'bottom', 
                   xytext = (0, -12), 
                   textcoords = 'offset points')
plt.xlabel("Quarter")
plt.ylabel("Number of lightning strikes")
plt.title("Number of lightning strikes per quarter (2016-2018)")
plt.show()
No description has been provided for this image
In [ ]: