In [10]:
# Import statements
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
In [11]:
# Read in the data.
df = pd.read_csv('eda_manipulate_date_strings_with_python.csv')
df.head()
Out[11]:
| date | number_of_strikes | center_point_geom | |
|---|---|---|---|
| 0 | 2016-08-05 | 16 | POINT(-101.5 24.7) |
| 1 | 2016-08-05 | 16 | POINT(-85 34.3) |
| 2 | 2016-08-05 | 16 | POINT(-89 41.4) |
| 3 | 2016-08-05 | 16 | POINT(-89.8 30.7) |
| 4 | 2016-08-05 | 16 | POINT(-86.2 37.9) |
In [17]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10479003 entries, 0 to 10479002 Data columns (total 7 columns): # Column Dtype --- ------ ----- 0 date datetime64[ns] 1 number_of_strikes int64 2 center_point_geom object 3 week object 4 month object 5 quarter object 6 year object dtypes: datetime64[ns](1), int64(1), object(5) memory usage: 559.6+ MB
In [18]:
df.describe
Out[18]:
<bound method NDFrame.describe of date number_of_strikes center_point_geom week month \
0 2016-08-05 16 POINT(-101.5 24.7) 2016-W31 2016-08
1 2016-08-05 16 POINT(-85 34.3) 2016-W31 2016-08
2 2016-08-05 16 POINT(-89 41.4) 2016-W31 2016-08
3 2016-08-05 16 POINT(-89.8 30.7) 2016-W31 2016-08
4 2016-08-05 16 POINT(-86.2 37.9) 2016-W31 2016-08
... ... ... ... ... ...
10478998 2018-05-27 13 POINT(-111.3 41) 2018-W21 2018-05
10478999 2018-05-27 13 POINT(-109.4 43.4) 2018-W21 2018-05
10479000 2018-05-27 13 POINT(-104.2 50) 2018-W21 2018-05
10479001 2018-05-27 13 POINT(-88.7 36.4) 2018-W21 2018-05
10479002 2018-05-27 13 POINT(-87.1 44.9) 2018-W21 2018-05
quarter year
0 2016-Q3 2016
1 2016-Q3 2016
2 2016-Q3 2016
3 2016-Q3 2016
4 2016-Q3 2016
... ... ...
10478998 2018-Q2 2018
10478999 2018-Q2 2018
10479000 2018-Q2 2018
10479001 2018-Q2 2018
10479002 2018-Q2 2018
[10479003 rows x 7 columns]>
In [19]:
# Convert the `date` column to datetime.
df['date'] = pd.to_datetime(df['date'])
In [20]:
##create four new columns: week, month, quarter, and year##
# Create four new columns.
df['week'] = df['date'].dt.strftime('%Y-W%V')
df['month'] = df['date'].dt.strftime('%Y-%m')
df['quarter'] = df['date'].dt.to_period('Q').dt.strftime('%Y-Q%q')
df['year'] = df['date'].dt.strftime('%Y')
In [21]:
df.head(10)
Out[21]:
| date | number_of_strikes | center_point_geom | week | month | quarter | year | |
|---|---|---|---|---|---|---|---|
| 0 | 2016-08-05 | 16 | POINT(-101.5 24.7) | 2016-W31 | 2016-08 | 2016-Q3 | 2016 |
| 1 | 2016-08-05 | 16 | POINT(-85 34.3) | 2016-W31 | 2016-08 | 2016-Q3 | 2016 |
| 2 | 2016-08-05 | 16 | POINT(-89 41.4) | 2016-W31 | 2016-08 | 2016-Q3 | 2016 |
| 3 | 2016-08-05 | 16 | POINT(-89.8 30.7) | 2016-W31 | 2016-08 | 2016-Q3 | 2016 |
| 4 | 2016-08-05 | 16 | POINT(-86.2 37.9) | 2016-W31 | 2016-08 | 2016-Q3 | 2016 |
| 5 | 2016-08-05 | 16 | POINT(-97.8 38.9) | 2016-W31 | 2016-08 | 2016-Q3 | 2016 |
| 6 | 2016-08-05 | 16 | POINT(-81.9 36) | 2016-W31 | 2016-08 | 2016-Q3 | 2016 |
| 7 | 2016-08-05 | 16 | POINT(-90.9 36.7) | 2016-W31 | 2016-08 | 2016-Q3 | 2016 |
| 8 | 2016-08-05 | 16 | POINT(-106.6 26.1) | 2016-W31 | 2016-08 | 2016-Q3 | 2016 |
| 9 | 2016-08-05 | 16 | POINT(-108 31.6) | 2016-W31 | 2016-08 | 2016-Q3 | 2016 |
In [24]:
# Create a new dataframe view of just 2018 data, summed by week.
# Create a new dataframe view of just 2018 data, summed by week
df_by_week_2018 = df[df['year'] == '2018'].groupby(['week']).sum(numeric_only=True).reset_index()
df_by_week_2018.head()
# NOTE: In pandas v.2.X+ you must set 'numeric_only=True' in the sum() function or else it will throw an error
Out[24]:
| week | number_of_strikes | |
|---|---|---|
| 0 | 2018-W01 | 34843 |
| 1 | 2018-W02 | 353425 |
| 2 | 2018-W03 | 37132 |
| 3 | 2018-W04 | 412772 |
| 4 | 2018-W05 | 34972 |
In [25]:
# Plot a bar graph of weekly strike totals in 2018.
plt.bar(x = df_by_week_2018['week'], height = df_by_week_2018['number_of_strikes'])
plt.plot()
plt.xlabel("Week number")
plt.ylabel("Number of lightning strikes")
plt.title("Number of lightning strikes per week (2018)");
In [26]:
plt.figure(figsize = (20, 5)) # Increase output size.
plt.bar(x = df_by_week_2018['week'], height = df_by_week_2018['number_of_strikes'])
plt.plot()
plt.xlabel("Week number")
plt.ylabel("Number of lightning strikes")
plt.title("Number of lightning strikes per week (2018)")
plt.xticks(rotation = 45, fontsize = 8) # Rotate x-axis labels and decrease font size.
plt.show()
In [27]:
df_by_quarter = df['number_of_strikes'].div(1000000)
df_by_quarter.head()
Out[27]:
0 0.000016 1 0.000016 2 0.000016 3 0.000016 4 0.000016 Name: number_of_strikes, dtype: float64
In [29]:
# Group 2016-2018 data by quarter and sum
df_by_quarter = df.groupby(['quarter']).sum(numeric_only=True).reset_index()
# Format as text, in millions
df_by_quarter['number_of_strikes_formatted'] = (
df_by_quarter['number_of_strikes']
.div(1000000)
.round(1)
.astype(str) + 'M'
)
df_by_quarter.head()
Out[29]:
| quarter | number_of_strikes | number_of_strikes_formatted | |
|---|---|---|---|
| 0 | 2016-Q1 | 2683798 | 2.7M |
| 1 | 2016-Q2 | 15084857 | 15.1M |
| 2 | 2016-Q3 | 21843820 | 21.8M |
| 3 | 2016-Q4 | 1969754 | 2.0M |
| 4 | 2017-Q1 | 2444279 | 2.4M |
In [30]:
def addlabels(x, y, labels):
'''
Iterates over data and plots text labels above each bar of bar graph.
'''
for i in range(len(x)):
plt.text(i, y[i], labels[i], ha = 'center', va = 'bottom')
In [31]:
plt.figure(figsize = (15, 5))
plt.bar(x = df_by_quarter['quarter'], height = df_by_quarter['number_of_strikes'])
addlabels(df_by_quarter['quarter'], df_by_quarter['number_of_strikes'], df_by_quarter['number_of_strikes_formatted'])
plt.plot()
plt.xlabel('Quarter')
plt.ylabel('Number of lightning strikes')
plt.title('Number of lightning strikes per quarter (2016-2018)')
plt.show()
In [32]:
# Create two new columns.
df_by_quarter['quarter_number'] = df_by_quarter['quarter'].str[-2:]
df_by_quarter['year'] = df_by_quarter['quarter'].str[:4]
df_by_quarter.head()
Out[32]:
| quarter | number_of_strikes | number_of_strikes_formatted | quarter_number | year | |
|---|---|---|---|---|---|
| 0 | 2016-Q1 | 2683798 | 2.7M | Q1 | 2016 |
| 1 | 2016-Q2 | 15084857 | 15.1M | Q2 | 2016 |
| 2 | 2016-Q3 | 21843820 | 21.8M | Q3 | 2016 |
| 3 | 2016-Q4 | 1969754 | 2.0M | Q4 | 2016 |
| 4 | 2017-Q1 | 2444279 | 2.4M | Q1 | 2017 |
In [33]:
plt.figure(figsize = (15, 5))
p = sns.barplot(
data = df_by_quarter,
x = 'quarter_number',
y = 'number_of_strikes',
hue = 'year')
for b in p.patches:
p.annotate(str(round(b.get_height()/1000000, 1))+'M',
(b.get_x() + b.get_width() / 2., b.get_height() + 1.2e6),
ha = 'center', va = 'bottom',
xytext = (0, -12),
textcoords = 'offset points')
plt.xlabel("Quarter")
plt.ylabel("Number of lightning strikes")
plt.title("Number of lightning strikes per quarter (2016-2018)")
plt.show()
In [ ]: