In [5]:
import pandas as pd
In [3]:
import pandas as pd
df = pd.read_csv("waze_dataset.csv")
In [7]:
df.head()
df.shape
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 14999 entries, 0 to 14998 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 14999 non-null int64 1 label 14299 non-null object 2 sessions 14999 non-null int64 3 drives 14999 non-null int64 4 total_sessions 14999 non-null float64 5 n_days_after_onboarding 14999 non-null int64 6 total_navigations_fav1 14999 non-null int64 7 total_navigations_fav2 14999 non-null int64 8 driven_km_drives 14999 non-null float64 9 duration_minutes_drives 14999 non-null float64 10 activity_days 14999 non-null int64 11 driving_days 14999 non-null int64 12 device 14999 non-null object dtypes: float64(3), int64(8), object(2) memory usage: 1.5+ MB
In [8]:
df.head(10)
Out[8]:
| ID | label | sessions | drives | total_sessions | n_days_after_onboarding | total_navigations_fav1 | total_navigations_fav2 | driven_km_drives | duration_minutes_drives | activity_days | driving_days | device | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | retained | 283 | 226 | 296.748273 | 2276 | 208 | 0 | 2628.845068 | 1985.775061 | 28 | 19 | Android |
| 1 | 1 | retained | 133 | 107 | 326.896596 | 1225 | 19 | 64 | 13715.920550 | 3160.472914 | 13 | 11 | iPhone |
| 2 | 2 | retained | 114 | 95 | 135.522926 | 2651 | 0 | 0 | 3059.148818 | 1610.735904 | 14 | 8 | Android |
| 3 | 3 | retained | 49 | 40 | 67.589221 | 15 | 322 | 7 | 913.591123 | 587.196542 | 7 | 3 | iPhone |
| 4 | 4 | retained | 84 | 68 | 168.247020 | 1562 | 166 | 5 | 3950.202008 | 1219.555924 | 27 | 18 | Android |
| 5 | 5 | retained | 113 | 103 | 279.544437 | 2637 | 0 | 0 | 901.238699 | 439.101397 | 15 | 11 | iPhone |
| 6 | 6 | retained | 3 | 2 | 236.725314 | 360 | 185 | 18 | 5249.172828 | 726.577205 | 28 | 23 | iPhone |
| 7 | 7 | retained | 39 | 35 | 176.072845 | 2999 | 0 | 0 | 7892.052468 | 2466.981741 | 22 | 20 | iPhone |
| 8 | 8 | retained | 57 | 46 | 183.532018 | 424 | 0 | 26 | 2651.709764 | 1594.342984 | 25 | 20 | Android |
| 9 | 9 | churned | 84 | 68 | 244.802115 | 2997 | 72 | 0 | 6043.460295 | 2341.838528 | 7 | 3 | iPhone |
In [9]:
df.isna().sum()
Out[9]:
ID 0 label 700 sessions 0 drives 0 total_sessions 0 n_days_after_onboarding 0 total_navigations_fav1 0 total_navigations_fav2 0 driven_km_drives 0 duration_minutes_drives 0 activity_days 0 driving_days 0 device 0 dtype: int64
In [10]:
# Number of non-missing labels
df['label'].notna().sum()
Out[10]:
np.int64(14299)
In [12]:
#labell distributtion #
df['label'].value_counts()
Out[12]:
label retained 11763 churned 2536 Name: count, dtype: int64
In [13]:
# Rows with missing label
missing_label = df[df['label'].isna()]
# Rows with label present
label_present = df[df['label'].notna()]
In [14]:
# Summary for rows with missing label
missing_label.describe()
Out[14]:
| ID | sessions | drives | total_sessions | n_days_after_onboarding | total_navigations_fav1 | total_navigations_fav2 | driven_km_drives | duration_minutes_drives | activity_days | driving_days | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 700.000000 | 700.000000 | 700.000000 | 700.000000 | 700.000000 | 700.000000 | 700.000000 | 700.000000 | 700.000000 | 700.000000 | 700.000000 |
| mean | 7405.584286 | 80.837143 | 67.798571 | 198.483348 | 1709.295714 | 118.717143 | 30.371429 | 3935.967029 | 1795.123358 | 15.382857 | 12.125714 |
| std | 4306.900234 | 79.987440 | 65.271926 | 140.561715 | 1005.306562 | 156.308140 | 46.306984 | 2443.107121 | 1419.242246 | 8.772714 | 7.626373 |
| min | 77.000000 | 0.000000 | 0.000000 | 5.582648 | 16.000000 | 0.000000 | 0.000000 | 290.119811 | 66.588493 | 0.000000 | 0.000000 |
| 25% | 3744.500000 | 23.000000 | 20.000000 | 94.056340 | 869.000000 | 4.000000 | 0.000000 | 2119.344818 | 779.009271 | 8.000000 | 6.000000 |
| 50% | 7443.000000 | 56.000000 | 47.500000 | 177.255925 | 1650.500000 | 62.500000 | 10.000000 | 3421.156721 | 1414.966279 | 15.000000 | 12.000000 |
| 75% | 11007.000000 | 112.250000 | 94.000000 | 266.058022 | 2508.750000 | 169.250000 | 43.000000 | 5166.097373 | 2443.955404 | 23.000000 | 18.000000 |
| max | 14993.000000 | 556.000000 | 445.000000 | 1076.879741 | 3498.000000 | 1096.000000 | 352.000000 | 15135.391280 | 9746.253023 | 31.000000 | 30.000000 |
In [15]:
# Summary for rows with label present
label_present.describe()
Out[15]:
| ID | sessions | drives | total_sessions | n_days_after_onboarding | total_navigations_fav1 | total_navigations_fav2 | driven_km_drives | duration_minutes_drives | activity_days | driving_days | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 14299.000000 | 14299.000000 | 14299.000000 | 14299.000000 | 14299.000000 | 14299.000000 | 14299.000000 | 14299.000000 | 14299.000000 | 14299.000000 | 14299.000000 |
| mean | 7503.573117 | 80.623820 | 67.255822 | 189.547409 | 1751.822505 | 121.747395 | 29.638296 | 4044.401535 | 1864.199794 | 15.544653 | 12.182530 |
| std | 4331.207621 | 80.736502 | 65.947295 | 136.189764 | 1008.663834 | 147.713428 | 45.350890 | 2504.977970 | 1448.005047 | 9.016088 | 7.833835 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.220211 | 4.000000 | 0.000000 | 0.000000 | 60.441250 | 18.282082 | 0.000000 | 0.000000 |
| 25% | 3749.500000 | 23.000000 | 20.000000 | 90.457733 | 878.500000 | 10.000000 | 0.000000 | 2217.319909 | 840.181344 | 8.000000 | 5.000000 |
| 50% | 7504.000000 | 56.000000 | 48.000000 | 158.718571 | 1749.000000 | 71.000000 | 9.000000 | 3496.545617 | 1479.394387 | 16.000000 | 12.000000 |
| 75% | 11257.500000 | 111.000000 | 93.000000 | 253.540450 | 2627.500000 | 178.000000 | 43.000000 | 5299.972162 | 2466.928876 | 23.000000 | 19.000000 |
| max | 14998.000000 | 743.000000 | 596.000000 | 1216.154633 | 3500.000000 | 1236.000000 | 415.000000 | 21183.401890 | 15851.727160 | 31.000000 | 30.000000 |
In [16]:
summary_comparison = pd.concat([missing_label.describe(), label_present.describe()], axis=1, keys=['Missing Label', 'Label Present'])
summary_comparison
Out[16]:
| Missing Label | ... | Label Present | |||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ID | sessions | drives | total_sessions | n_days_after_onboarding | total_navigations_fav1 | total_navigations_fav2 | driven_km_drives | duration_minutes_drives | activity_days | ... | sessions | drives | total_sessions | n_days_after_onboarding | total_navigations_fav1 | total_navigations_fav2 | driven_km_drives | duration_minutes_drives | activity_days | driving_days | |
| count | 700.000000 | 700.000000 | 700.000000 | 700.000000 | 700.000000 | 700.000000 | 700.000000 | 700.000000 | 700.000000 | 700.000000 | ... | 14299.000000 | 14299.000000 | 14299.000000 | 14299.000000 | 14299.000000 | 14299.000000 | 14299.000000 | 14299.000000 | 14299.000000 | 14299.000000 |
| mean | 7405.584286 | 80.837143 | 67.798571 | 198.483348 | 1709.295714 | 118.717143 | 30.371429 | 3935.967029 | 1795.123358 | 15.382857 | ... | 80.623820 | 67.255822 | 189.547409 | 1751.822505 | 121.747395 | 29.638296 | 4044.401535 | 1864.199794 | 15.544653 | 12.182530 |
| std | 4306.900234 | 79.987440 | 65.271926 | 140.561715 | 1005.306562 | 156.308140 | 46.306984 | 2443.107121 | 1419.242246 | 8.772714 | ... | 80.736502 | 65.947295 | 136.189764 | 1008.663834 | 147.713428 | 45.350890 | 2504.977970 | 1448.005047 | 9.016088 | 7.833835 |
| min | 77.000000 | 0.000000 | 0.000000 | 5.582648 | 16.000000 | 0.000000 | 0.000000 | 290.119811 | 66.588493 | 0.000000 | ... | 0.000000 | 0.000000 | 0.220211 | 4.000000 | 0.000000 | 0.000000 | 60.441250 | 18.282082 | 0.000000 | 0.000000 |
| 25% | 3744.500000 | 23.000000 | 20.000000 | 94.056340 | 869.000000 | 4.000000 | 0.000000 | 2119.344818 | 779.009271 | 8.000000 | ... | 23.000000 | 20.000000 | 90.457733 | 878.500000 | 10.000000 | 0.000000 | 2217.319909 | 840.181344 | 8.000000 | 5.000000 |
| 50% | 7443.000000 | 56.000000 | 47.500000 | 177.255925 | 1650.500000 | 62.500000 | 10.000000 | 3421.156721 | 1414.966279 | 15.000000 | ... | 56.000000 | 48.000000 | 158.718571 | 1749.000000 | 71.000000 | 9.000000 | 3496.545617 | 1479.394387 | 16.000000 | 12.000000 |
| 75% | 11007.000000 | 112.250000 | 94.000000 | 266.058022 | 2508.750000 | 169.250000 | 43.000000 | 5166.097373 | 2443.955404 | 23.000000 | ... | 111.000000 | 93.000000 | 253.540450 | 2627.500000 | 178.000000 | 43.000000 | 5299.972162 | 2466.928876 | 23.000000 | 19.000000 |
| max | 14993.000000 | 556.000000 | 445.000000 | 1076.879741 | 3498.000000 | 1096.000000 | 352.000000 | 15135.391280 | 9746.253023 | 31.000000 | ... | 743.000000 | 596.000000 | 1216.154633 | 3500.000000 | 1236.000000 | 415.000000 | 21183.401890 | 15851.727160 | 31.000000 | 30.000000 |
8 rows × 22 columns
In [17]:
# Isolate rows where 'label' is missing
missing_label_rows = df[df['label'].isna()]
# View the first few rows
missing_label_rows.head()
Out[17]:
| ID | label | sessions | drives | total_sessions | n_days_after_onboarding | total_navigations_fav1 | total_navigations_fav2 | driven_km_drives | duration_minutes_drives | activity_days | driving_days | device | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 77 | 77 | NaN | 63 | 50 | 133.104155 | 783 | 201 | 0 | 2649.015822 | 1517.209970 | 19 | 13 | iPhone |
| 80 | 80 | NaN | 116 | 93 | 436.060183 | 1584 | 283 | 62 | 4183.409514 | 3121.889952 | 18 | 15 | iPhone |
| 98 | 98 | NaN | 78 | 64 | 583.492789 | 3414 | 0 | 0 | 1811.140893 | 642.189122 | 12 | 11 | Android |
| 111 | 111 | NaN | 106 | 102 | 113.379056 | 2228 | 14 | 0 | 2817.481840 | 2011.724274 | 17 | 13 | Android |
| 142 | 142 | NaN | 32 | 26 | 222.129310 | 208 | 55 | 10 | 2459.816477 | 874.427617 | 11 | 7 | iPhone |
In [18]:
# Isolate rows where 'label' is NOT missing
label_present_rows = df[df['label'].notna()]
# Display summary statistics for numeric columns
label_present_rows.describe()
Out[18]:
| ID | sessions | drives | total_sessions | n_days_after_onboarding | total_navigations_fav1 | total_navigations_fav2 | driven_km_drives | duration_minutes_drives | activity_days | driving_days | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 14299.000000 | 14299.000000 | 14299.000000 | 14299.000000 | 14299.000000 | 14299.000000 | 14299.000000 | 14299.000000 | 14299.000000 | 14299.000000 | 14299.000000 |
| mean | 7503.573117 | 80.623820 | 67.255822 | 189.547409 | 1751.822505 | 121.747395 | 29.638296 | 4044.401535 | 1864.199794 | 15.544653 | 12.182530 |
| std | 4331.207621 | 80.736502 | 65.947295 | 136.189764 | 1008.663834 | 147.713428 | 45.350890 | 2504.977970 | 1448.005047 | 9.016088 | 7.833835 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.220211 | 4.000000 | 0.000000 | 0.000000 | 60.441250 | 18.282082 | 0.000000 | 0.000000 |
| 25% | 3749.500000 | 23.000000 | 20.000000 | 90.457733 | 878.500000 | 10.000000 | 0.000000 | 2217.319909 | 840.181344 | 8.000000 | 5.000000 |
| 50% | 7504.000000 | 56.000000 | 48.000000 | 158.718571 | 1749.000000 | 71.000000 | 9.000000 | 3496.545617 | 1479.394387 | 16.000000 | 12.000000 |
| 75% | 11257.500000 | 111.000000 | 93.000000 | 253.540450 | 2627.500000 | 178.000000 | 43.000000 | 5299.972162 | 2466.928876 | 23.000000 | 19.000000 |
| max | 14998.000000 | 743.000000 | 596.000000 | 1216.154633 | 3500.000000 | 1236.000000 | 415.000000 | 21183.401890 | 15851.727160 | 31.000000 | 30.000000 |
In [23]:
#Rows with missing labels#
df[df['label'].isna()].describe()
Out[23]:
| ID | sessions | drives | total_sessions | n_days_after_onboarding | total_navigations_fav1 | total_navigations_fav2 | driven_km_drives | duration_minutes_drives | activity_days | driving_days | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 700.000000 | 700.000000 | 700.000000 | 700.000000 | 700.000000 | 700.000000 | 700.000000 | 700.000000 | 700.000000 | 700.000000 | 700.000000 |
| mean | 7405.584286 | 80.837143 | 67.798571 | 198.483348 | 1709.295714 | 118.717143 | 30.371429 | 3935.967029 | 1795.123358 | 15.382857 | 12.125714 |
| std | 4306.900234 | 79.987440 | 65.271926 | 140.561715 | 1005.306562 | 156.308140 | 46.306984 | 2443.107121 | 1419.242246 | 8.772714 | 7.626373 |
| min | 77.000000 | 0.000000 | 0.000000 | 5.582648 | 16.000000 | 0.000000 | 0.000000 | 290.119811 | 66.588493 | 0.000000 | 0.000000 |
| 25% | 3744.500000 | 23.000000 | 20.000000 | 94.056340 | 869.000000 | 4.000000 | 0.000000 | 2119.344818 | 779.009271 | 8.000000 | 6.000000 |
| 50% | 7443.000000 | 56.000000 | 47.500000 | 177.255925 | 1650.500000 | 62.500000 | 10.000000 | 3421.156721 | 1414.966279 | 15.000000 | 12.000000 |
| 75% | 11007.000000 | 112.250000 | 94.000000 | 266.058022 | 2508.750000 | 169.250000 | 43.000000 | 5166.097373 | 2443.955404 | 23.000000 | 18.000000 |
| max | 14993.000000 | 556.000000 | 445.000000 | 1076.879741 | 3498.000000 | 1096.000000 | 352.000000 | 15135.391280 | 9746.253023 | 31.000000 | 30.000000 |
In [22]:
#Rows with labels present#
df[df['label'].notna()].describe()
Out[22]:
| ID | sessions | drives | total_sessions | n_days_after_onboarding | total_navigations_fav1 | total_navigations_fav2 | driven_km_drives | duration_minutes_drives | activity_days | driving_days | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 14299.000000 | 14299.000000 | 14299.000000 | 14299.000000 | 14299.000000 | 14299.000000 | 14299.000000 | 14299.000000 | 14299.000000 | 14299.000000 | 14299.000000 |
| mean | 7503.573117 | 80.623820 | 67.255822 | 189.547409 | 1751.822505 | 121.747395 | 29.638296 | 4044.401535 | 1864.199794 | 15.544653 | 12.182530 |
| std | 4331.207621 | 80.736502 | 65.947295 | 136.189764 | 1008.663834 | 147.713428 | 45.350890 | 2504.977970 | 1448.005047 | 9.016088 | 7.833835 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.220211 | 4.000000 | 0.000000 | 0.000000 | 60.441250 | 18.282082 | 0.000000 | 0.000000 |
| 25% | 3749.500000 | 23.000000 | 20.000000 | 90.457733 | 878.500000 | 10.000000 | 0.000000 | 2217.319909 | 840.181344 | 8.000000 | 5.000000 |
| 50% | 7504.000000 | 56.000000 | 48.000000 | 158.718571 | 1749.000000 | 71.000000 | 9.000000 | 3496.545617 | 1479.394387 | 16.000000 | 12.000000 |
| 75% | 11257.500000 | 111.000000 | 93.000000 | 253.540450 | 2627.500000 | 178.000000 | 43.000000 | 5299.972162 | 2466.928876 | 23.000000 | 19.000000 |
| max | 14998.000000 | 743.000000 | 596.000000 | 1216.154633 | 3500.000000 | 1236.000000 | 415.000000 | 21183.401890 | 15851.727160 | 31.000000 | 30.000000 |
In [25]:
#Observations The mean and median are very close between the two groups (100 difference on a scale of 7500). This suggests no strong difference in central tendency. Min & Max -Both populations cover almost the full range of IDs (0–14998), so extremes are similar. Standard Deviation Spread is almost identical (~4300), meaning variability is very similar. Conclusion Based on these summary statistics, there is no discernible difference between the rows with missing labels and those with labels present. The populations appear to be roughly similar in terms of numeric distribution.#
In [28]:
# Count device types where label is missing #
df[df['label'].isna()]['device'].value_counts()
Out[28]:
device iPhone 447 Android 253 Name: count, dtype: int64
In [29]:
# Count device types where label is missing #
df[df['label'].isna()].groupby('device').size()
Out[29]:
device Android 253 iPhone 447 dtype: int64
In [30]:
# Percentage of each device among rows with missing labels#
df[df['label'].isna()]['device'].value_counts(normalize=True) * 100
Out[30]:
device iPhone 63.857143 Android 36.142857 Name: proportion, dtype: float64
In [31]:
# Device ratio in the full dataset##
df['device'].value_counts(normalize=True) * 100
Out[31]:
device iPhone 64.484299 Android 35.515701 Name: proportion, dtype: float64
In [32]:
# Calculate counts of churned vs. retained
### YOUR CODE HERE ###
df['label'].value_counts()
df['label'].value_counts(normalize=True) * 100
Out[32]:
label retained 82.264494 churned 17.735506 Name: proportion, dtype: float64
In [33]:
# Median values for each variable by label #
df.groupby('label').median(numeric_only=True)
Out[33]:
| ID | sessions | drives | total_sessions | n_days_after_onboarding | total_navigations_fav1 | total_navigations_fav2 | driven_km_drives | duration_minutes_drives | activity_days | driving_days | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| label | |||||||||||
| churned | 7477.5 | 59.0 | 50.0 | 164.339042 | 1321.0 | 84.5 | 11.0 | 3652.655666 | 1607.183785 | 8.0 | 6.0 |
| retained | 7509.0 | 56.0 | 47.0 | 157.586756 | 1843.0 | 68.0 | 9.0 | 3464.684614 | 1458.046141 | 17.0 | 14.0 |
In [34]:
# Add a column to df called `km_per_drive`
### YOUR CODE HERE ###
df['km_per_drive'] = df['driven_km_drives'] / df['drives']
# Group by `label`, calculate the median, and isolate for km per drive
### YOUR CODE HERE ###
df.groupby('label')['km_per_drive'].median()
Out[34]:
label churned 74.109416 retained 75.014702 Name: km_per_drive, dtype: float64
In [35]:
# Add a column to df called `km_per_driving_day`
### YOUR CODE HERE ###
df['km_per_driving_day'] = df['driven_km_drives'] / df['driving_days']
# Group by `label`, calculate the median, and isolate for km per driving day
### YOUR CODE HERE ###
df.groupby('label')['km_per_driving_day'].median()
Out[35]:
label churned 697.541999 retained 289.549333 Name: km_per_driving_day, dtype: float64
In [36]:
# Add a column to df called `drives_per_driving_day`
### YOUR CODE HERE ###
df['drives_per_driving_day'] = df['drives'] / df['driving_days']
# Group by `label`, calculate the median, and isolate for drives per driving day
### YOUR CODE HERE ###
df.groupby('label')['drives_per_driving_day'].median()
Out[36]:
label churned 10.0000 retained 4.0625 Name: drives_per_driving_day, dtype: float64
In [37]:
# For each label, calculate the number of Android users and iPhone users
### YOUR CODE HERE ###
df.groupby('label')['device'].value_counts()
Out[37]:
label device
churned iPhone 1645
Android 891
retained iPhone 7580
Android 4183
Name: count, dtype: int64
In [38]:
# For each label, calculate the percentage of Android users and iPhone users
### YOUR CODE HERE ###
# Percentage of each device type within each label group
df.groupby('label')['device'].value_counts(normalize=True) * 100
Out[38]:
label device
churned iPhone 64.865931
Android 35.134069
retained iPhone 64.439344
Android 35.560656
Name: proportion, dtype: float64
In [ ]: