In [5]:
import pandas as pd
In [3]:
import pandas as pd


df = pd.read_csv("waze_dataset.csv")
In [7]:
df.head()
df.shape
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ID                       14999 non-null  int64  
 1   label                    14299 non-null  object 
 2   sessions                 14999 non-null  int64  
 3   drives                   14999 non-null  int64  
 4   total_sessions           14999 non-null  float64
 5   n_days_after_onboarding  14999 non-null  int64  
 6   total_navigations_fav1   14999 non-null  int64  
 7   total_navigations_fav2   14999 non-null  int64  
 8   driven_km_drives         14999 non-null  float64
 9   duration_minutes_drives  14999 non-null  float64
 10  activity_days            14999 non-null  int64  
 11  driving_days             14999 non-null  int64  
 12  device                   14999 non-null  object 
dtypes: float64(3), int64(8), object(2)
memory usage: 1.5+ MB
In [8]:
df.head(10)
Out[8]:
ID label sessions drives total_sessions n_days_after_onboarding total_navigations_fav1 total_navigations_fav2 driven_km_drives duration_minutes_drives activity_days driving_days device
0 0 retained 283 226 296.748273 2276 208 0 2628.845068 1985.775061 28 19 Android
1 1 retained 133 107 326.896596 1225 19 64 13715.920550 3160.472914 13 11 iPhone
2 2 retained 114 95 135.522926 2651 0 0 3059.148818 1610.735904 14 8 Android
3 3 retained 49 40 67.589221 15 322 7 913.591123 587.196542 7 3 iPhone
4 4 retained 84 68 168.247020 1562 166 5 3950.202008 1219.555924 27 18 Android
5 5 retained 113 103 279.544437 2637 0 0 901.238699 439.101397 15 11 iPhone
6 6 retained 3 2 236.725314 360 185 18 5249.172828 726.577205 28 23 iPhone
7 7 retained 39 35 176.072845 2999 0 0 7892.052468 2466.981741 22 20 iPhone
8 8 retained 57 46 183.532018 424 0 26 2651.709764 1594.342984 25 20 Android
9 9 churned 84 68 244.802115 2997 72 0 6043.460295 2341.838528 7 3 iPhone
In [9]:
df.isna().sum()
Out[9]:
ID                           0
label                      700
sessions                     0
drives                       0
total_sessions               0
n_days_after_onboarding      0
total_navigations_fav1       0
total_navigations_fav2       0
driven_km_drives             0
duration_minutes_drives      0
activity_days                0
driving_days                 0
device                       0
dtype: int64
In [10]:
# Number of non-missing labels
df['label'].notna().sum()
Out[10]:
np.int64(14299)
In [12]:
#labell distributtion #
df['label'].value_counts()
Out[12]:
label
retained    11763
churned      2536
Name: count, dtype: int64
In [13]:
# Rows with missing label
missing_label = df[df['label'].isna()]

# Rows with label present
label_present = df[df['label'].notna()]
In [14]:
# Summary for rows with missing label
missing_label.describe()
Out[14]:
ID sessions drives total_sessions n_days_after_onboarding total_navigations_fav1 total_navigations_fav2 driven_km_drives duration_minutes_drives activity_days driving_days
count 700.000000 700.000000 700.000000 700.000000 700.000000 700.000000 700.000000 700.000000 700.000000 700.000000 700.000000
mean 7405.584286 80.837143 67.798571 198.483348 1709.295714 118.717143 30.371429 3935.967029 1795.123358 15.382857 12.125714
std 4306.900234 79.987440 65.271926 140.561715 1005.306562 156.308140 46.306984 2443.107121 1419.242246 8.772714 7.626373
min 77.000000 0.000000 0.000000 5.582648 16.000000 0.000000 0.000000 290.119811 66.588493 0.000000 0.000000
25% 3744.500000 23.000000 20.000000 94.056340 869.000000 4.000000 0.000000 2119.344818 779.009271 8.000000 6.000000
50% 7443.000000 56.000000 47.500000 177.255925 1650.500000 62.500000 10.000000 3421.156721 1414.966279 15.000000 12.000000
75% 11007.000000 112.250000 94.000000 266.058022 2508.750000 169.250000 43.000000 5166.097373 2443.955404 23.000000 18.000000
max 14993.000000 556.000000 445.000000 1076.879741 3498.000000 1096.000000 352.000000 15135.391280 9746.253023 31.000000 30.000000
In [15]:
# Summary for rows with label present
label_present.describe()
Out[15]:
ID sessions drives total_sessions n_days_after_onboarding total_navigations_fav1 total_navigations_fav2 driven_km_drives duration_minutes_drives activity_days driving_days
count 14299.000000 14299.000000 14299.000000 14299.000000 14299.000000 14299.000000 14299.000000 14299.000000 14299.000000 14299.000000 14299.000000
mean 7503.573117 80.623820 67.255822 189.547409 1751.822505 121.747395 29.638296 4044.401535 1864.199794 15.544653 12.182530
std 4331.207621 80.736502 65.947295 136.189764 1008.663834 147.713428 45.350890 2504.977970 1448.005047 9.016088 7.833835
min 0.000000 0.000000 0.000000 0.220211 4.000000 0.000000 0.000000 60.441250 18.282082 0.000000 0.000000
25% 3749.500000 23.000000 20.000000 90.457733 878.500000 10.000000 0.000000 2217.319909 840.181344 8.000000 5.000000
50% 7504.000000 56.000000 48.000000 158.718571 1749.000000 71.000000 9.000000 3496.545617 1479.394387 16.000000 12.000000
75% 11257.500000 111.000000 93.000000 253.540450 2627.500000 178.000000 43.000000 5299.972162 2466.928876 23.000000 19.000000
max 14998.000000 743.000000 596.000000 1216.154633 3500.000000 1236.000000 415.000000 21183.401890 15851.727160 31.000000 30.000000
In [16]:
summary_comparison = pd.concat([missing_label.describe(), label_present.describe()], axis=1, keys=['Missing Label', 'Label Present'])
summary_comparison
Out[16]:
Missing Label ... Label Present
ID sessions drives total_sessions n_days_after_onboarding total_navigations_fav1 total_navigations_fav2 driven_km_drives duration_minutes_drives activity_days ... sessions drives total_sessions n_days_after_onboarding total_navigations_fav1 total_navigations_fav2 driven_km_drives duration_minutes_drives activity_days driving_days
count 700.000000 700.000000 700.000000 700.000000 700.000000 700.000000 700.000000 700.000000 700.000000 700.000000 ... 14299.000000 14299.000000 14299.000000 14299.000000 14299.000000 14299.000000 14299.000000 14299.000000 14299.000000 14299.000000
mean 7405.584286 80.837143 67.798571 198.483348 1709.295714 118.717143 30.371429 3935.967029 1795.123358 15.382857 ... 80.623820 67.255822 189.547409 1751.822505 121.747395 29.638296 4044.401535 1864.199794 15.544653 12.182530
std 4306.900234 79.987440 65.271926 140.561715 1005.306562 156.308140 46.306984 2443.107121 1419.242246 8.772714 ... 80.736502 65.947295 136.189764 1008.663834 147.713428 45.350890 2504.977970 1448.005047 9.016088 7.833835
min 77.000000 0.000000 0.000000 5.582648 16.000000 0.000000 0.000000 290.119811 66.588493 0.000000 ... 0.000000 0.000000 0.220211 4.000000 0.000000 0.000000 60.441250 18.282082 0.000000 0.000000
25% 3744.500000 23.000000 20.000000 94.056340 869.000000 4.000000 0.000000 2119.344818 779.009271 8.000000 ... 23.000000 20.000000 90.457733 878.500000 10.000000 0.000000 2217.319909 840.181344 8.000000 5.000000
50% 7443.000000 56.000000 47.500000 177.255925 1650.500000 62.500000 10.000000 3421.156721 1414.966279 15.000000 ... 56.000000 48.000000 158.718571 1749.000000 71.000000 9.000000 3496.545617 1479.394387 16.000000 12.000000
75% 11007.000000 112.250000 94.000000 266.058022 2508.750000 169.250000 43.000000 5166.097373 2443.955404 23.000000 ... 111.000000 93.000000 253.540450 2627.500000 178.000000 43.000000 5299.972162 2466.928876 23.000000 19.000000
max 14993.000000 556.000000 445.000000 1076.879741 3498.000000 1096.000000 352.000000 15135.391280 9746.253023 31.000000 ... 743.000000 596.000000 1216.154633 3500.000000 1236.000000 415.000000 21183.401890 15851.727160 31.000000 30.000000

8 rows × 22 columns

In [17]:
# Isolate rows where 'label' is missing
missing_label_rows = df[df['label'].isna()]

# View the first few rows
missing_label_rows.head()
Out[17]:
ID label sessions drives total_sessions n_days_after_onboarding total_navigations_fav1 total_navigations_fav2 driven_km_drives duration_minutes_drives activity_days driving_days device
77 77 NaN 63 50 133.104155 783 201 0 2649.015822 1517.209970 19 13 iPhone
80 80 NaN 116 93 436.060183 1584 283 62 4183.409514 3121.889952 18 15 iPhone
98 98 NaN 78 64 583.492789 3414 0 0 1811.140893 642.189122 12 11 Android
111 111 NaN 106 102 113.379056 2228 14 0 2817.481840 2011.724274 17 13 Android
142 142 NaN 32 26 222.129310 208 55 10 2459.816477 874.427617 11 7 iPhone
In [18]:
# Isolate rows where 'label' is NOT missing
label_present_rows = df[df['label'].notna()]

# Display summary statistics for numeric columns
label_present_rows.describe()
Out[18]:
ID sessions drives total_sessions n_days_after_onboarding total_navigations_fav1 total_navigations_fav2 driven_km_drives duration_minutes_drives activity_days driving_days
count 14299.000000 14299.000000 14299.000000 14299.000000 14299.000000 14299.000000 14299.000000 14299.000000 14299.000000 14299.000000 14299.000000
mean 7503.573117 80.623820 67.255822 189.547409 1751.822505 121.747395 29.638296 4044.401535 1864.199794 15.544653 12.182530
std 4331.207621 80.736502 65.947295 136.189764 1008.663834 147.713428 45.350890 2504.977970 1448.005047 9.016088 7.833835
min 0.000000 0.000000 0.000000 0.220211 4.000000 0.000000 0.000000 60.441250 18.282082 0.000000 0.000000
25% 3749.500000 23.000000 20.000000 90.457733 878.500000 10.000000 0.000000 2217.319909 840.181344 8.000000 5.000000
50% 7504.000000 56.000000 48.000000 158.718571 1749.000000 71.000000 9.000000 3496.545617 1479.394387 16.000000 12.000000
75% 11257.500000 111.000000 93.000000 253.540450 2627.500000 178.000000 43.000000 5299.972162 2466.928876 23.000000 19.000000
max 14998.000000 743.000000 596.000000 1216.154633 3500.000000 1236.000000 415.000000 21183.401890 15851.727160 31.000000 30.000000
In [23]:
#Rows with missing labels#
df[df['label'].isna()].describe()
Out[23]:
ID sessions drives total_sessions n_days_after_onboarding total_navigations_fav1 total_navigations_fav2 driven_km_drives duration_minutes_drives activity_days driving_days
count 700.000000 700.000000 700.000000 700.000000 700.000000 700.000000 700.000000 700.000000 700.000000 700.000000 700.000000
mean 7405.584286 80.837143 67.798571 198.483348 1709.295714 118.717143 30.371429 3935.967029 1795.123358 15.382857 12.125714
std 4306.900234 79.987440 65.271926 140.561715 1005.306562 156.308140 46.306984 2443.107121 1419.242246 8.772714 7.626373
min 77.000000 0.000000 0.000000 5.582648 16.000000 0.000000 0.000000 290.119811 66.588493 0.000000 0.000000
25% 3744.500000 23.000000 20.000000 94.056340 869.000000 4.000000 0.000000 2119.344818 779.009271 8.000000 6.000000
50% 7443.000000 56.000000 47.500000 177.255925 1650.500000 62.500000 10.000000 3421.156721 1414.966279 15.000000 12.000000
75% 11007.000000 112.250000 94.000000 266.058022 2508.750000 169.250000 43.000000 5166.097373 2443.955404 23.000000 18.000000
max 14993.000000 556.000000 445.000000 1076.879741 3498.000000 1096.000000 352.000000 15135.391280 9746.253023 31.000000 30.000000
In [22]:
#Rows with labels present#
df[df['label'].notna()].describe()
Out[22]:
ID sessions drives total_sessions n_days_after_onboarding total_navigations_fav1 total_navigations_fav2 driven_km_drives duration_minutes_drives activity_days driving_days
count 14299.000000 14299.000000 14299.000000 14299.000000 14299.000000 14299.000000 14299.000000 14299.000000 14299.000000 14299.000000 14299.000000
mean 7503.573117 80.623820 67.255822 189.547409 1751.822505 121.747395 29.638296 4044.401535 1864.199794 15.544653 12.182530
std 4331.207621 80.736502 65.947295 136.189764 1008.663834 147.713428 45.350890 2504.977970 1448.005047 9.016088 7.833835
min 0.000000 0.000000 0.000000 0.220211 4.000000 0.000000 0.000000 60.441250 18.282082 0.000000 0.000000
25% 3749.500000 23.000000 20.000000 90.457733 878.500000 10.000000 0.000000 2217.319909 840.181344 8.000000 5.000000
50% 7504.000000 56.000000 48.000000 158.718571 1749.000000 71.000000 9.000000 3496.545617 1479.394387 16.000000 12.000000
75% 11257.500000 111.000000 93.000000 253.540450 2627.500000 178.000000 43.000000 5299.972162 2466.928876 23.000000 19.000000
max 14998.000000 743.000000 596.000000 1216.154633 3500.000000 1236.000000 415.000000 21183.401890 15851.727160 31.000000 30.000000
In [25]:
#Observations The mean and median are very close between the two groups (100 difference on a scale of 7500). This suggests no strong difference in central tendency. Min & Max -Both populations cover almost the full range of IDs (0–14998), so extremes are similar. Standard Deviation Spread is almost identical (~4300), meaning variability is very similar. Conclusion Based on these summary statistics, there is no discernible difference between the rows with missing labels and those with labels present. The populations appear to be roughly similar in terms of numeric distribution.#
In [28]:
# Count device types where label is missing #
df[df['label'].isna()]['device'].value_counts()
Out[28]:
device
iPhone     447
Android    253
Name: count, dtype: int64
In [29]:
# Count device types where label is missing #
df[df['label'].isna()].groupby('device').size()
Out[29]:
device
Android    253
iPhone     447
dtype: int64
In [30]:
# Percentage of each device among rows with missing labels#

df[df['label'].isna()]['device'].value_counts(normalize=True) * 100
Out[30]:
device
iPhone     63.857143
Android    36.142857
Name: proportion, dtype: float64
In [31]:
# Device ratio in the full dataset##

df['device'].value_counts(normalize=True) * 100
Out[31]:
device
iPhone     64.484299
Android    35.515701
Name: proportion, dtype: float64
In [32]:
# Calculate counts of churned vs. retained
### YOUR CODE HERE ###

df['label'].value_counts()
df['label'].value_counts(normalize=True) * 100
Out[32]:
label
retained    82.264494
churned     17.735506
Name: proportion, dtype: float64
In [33]:
# Median values for each variable by label #
df.groupby('label').median(numeric_only=True)
Out[33]:
ID sessions drives total_sessions n_days_after_onboarding total_navigations_fav1 total_navigations_fav2 driven_km_drives duration_minutes_drives activity_days driving_days
label
churned 7477.5 59.0 50.0 164.339042 1321.0 84.5 11.0 3652.655666 1607.183785 8.0 6.0
retained 7509.0 56.0 47.0 157.586756 1843.0 68.0 9.0 3464.684614 1458.046141 17.0 14.0
In [34]:
# Add a column to df called `km_per_drive`
### YOUR CODE HERE ###

df['km_per_drive'] = df['driven_km_drives'] / df['drives']

# Group by `label`, calculate the median, and isolate for km per drive
### YOUR CODE HERE ###

df.groupby('label')['km_per_drive'].median()
Out[34]:
label
churned     74.109416
retained    75.014702
Name: km_per_drive, dtype: float64
In [35]:
# Add a column to df called `km_per_driving_day`
### YOUR CODE HERE ###

df['km_per_driving_day'] = df['driven_km_drives'] / df['driving_days']


# Group by `label`, calculate the median, and isolate for km per driving day
### YOUR CODE HERE ###

df.groupby('label')['km_per_driving_day'].median()
Out[35]:
label
churned     697.541999
retained    289.549333
Name: km_per_driving_day, dtype: float64
In [36]:
# Add a column to df called `drives_per_driving_day`
### YOUR CODE HERE ###

df['drives_per_driving_day'] = df['drives'] / df['driving_days']

# Group by `label`, calculate the median, and isolate for drives per driving day
### YOUR CODE HERE ###

df.groupby('label')['drives_per_driving_day'].median()
Out[36]:
label
churned     10.0000
retained     4.0625
Name: drives_per_driving_day, dtype: float64
In [37]:
# For each label, calculate the number of Android users and iPhone users
### YOUR CODE HERE ###

df.groupby('label')['device'].value_counts()
Out[37]:
label     device 
churned   iPhone     1645
          Android     891
retained  iPhone     7580
          Android    4183
Name: count, dtype: int64
In [38]:
# For each label, calculate the percentage of Android users and iPhone users
### YOUR CODE HERE ###

# Percentage of each device type within each label group
df.groupby('label')['device'].value_counts(normalize=True) * 100
Out[38]:
label     device 
churned   iPhone     64.865931
          Android    35.134069
retained  iPhone     64.439344
          Android    35.560656
Name: proportion, dtype: float64
In [ ]: