import pandas as pd

import pandas as pd


df = pd.read_csv("waze_dataset.csv")

df.head()
df.shape
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ID                       14999 non-null  int64  
 1   label                    14299 non-null  object 
 2   sessions                 14999 non-null  int64  
 3   drives                   14999 non-null  int64  
 4   total_sessions           14999 non-null  float64
 5   n_days_after_onboarding  14999 non-null  int64  
 6   total_navigations_fav1   14999 non-null  int64  
 7   total_navigations_fav2   14999 non-null  int64  
 8   driven_km_drives         14999 non-null  float64
 9   duration_minutes_drives  14999 non-null  float64
 10  activity_days            14999 non-null  int64  
 11  driving_days             14999 non-null  int64  
 12  device                   14999 non-null  object 
dtypes: float64(3), int64(8), object(2)
memory usage: 1.5+ MB

df.head(10)

df.isna().sum()

ID                           0
label                      700
sessions                     0
drives                       0
total_sessions               0
n_days_after_onboarding      0
total_navigations_fav1       0
total_navigations_fav2       0
driven_km_drives             0
duration_minutes_drives      0
activity_days                0
driving_days                 0
device                       0
dtype: int64

# Number of non-missing labels
df['label'].notna().sum()

np.int64(14299)

#labell distributtion #
df['label'].value_counts()

label
retained    11763
churned      2536
Name: count, dtype: int64

# Rows with missing label
missing_label = df[df['label'].isna()]

# Rows with label present
label_present = df[df['label'].notna()]

# Summary for rows with missing label
missing_label.describe()

# Summary for rows with label present
label_present.describe()

summary_comparison = pd.concat([missing_label.describe(), label_present.describe()], axis=1, keys=['Missing Label', 'Label Present'])
summary_comparison

# Isolate rows where 'label' is missing
missing_label_rows = df[df['label'].isna()]

# View the first few rows
missing_label_rows.head()

# Isolate rows where 'label' is NOT missing
label_present_rows = df[df['label'].notna()]

# Display summary statistics for numeric columns
label_present_rows.describe()

#Rows with missing labels#
df[df['label'].isna()].describe()

#Rows with labels present#
df[df['label'].notna()].describe()

#Observations The mean and median are very close between the two groups (100 difference on a scale of 7500). This suggests no strong difference in central tendency. Min & Max -Both populations cover almost the full range of IDs (0–14998), so extremes are similar. Standard Deviation Spread is almost identical (~4300), meaning variability is very similar. Conclusion Based on these summary statistics, there is no discernible difference between the rows with missing labels and those with labels present. The populations appear to be roughly similar in terms of numeric distribution.#

# Count device types where label is missing #
df[df['label'].isna()]['device'].value_counts()

device
iPhone     447
Android    253
Name: count, dtype: int64

# Count device types where label is missing #
df[df['label'].isna()].groupby('device').size()

device
Android    253
iPhone     447
dtype: int64

# Percentage of each device among rows with missing labels#

df[df['label'].isna()]['device'].value_counts(normalize=True) * 100

device
iPhone     63.857143
Android    36.142857
Name: proportion, dtype: float64

# Device ratio in the full dataset##

df['device'].value_counts(normalize=True) * 100

device
iPhone     64.484299
Android    35.515701
Name: proportion, dtype: float64

# Calculate counts of churned vs. retained
### YOUR CODE HERE ###

df['label'].value_counts()
df['label'].value_counts(normalize=True) * 100

label
retained    82.264494
churned     17.735506
Name: proportion, dtype: float64

# Median values for each variable by label #
df.groupby('label').median(numeric_only=True)

# Add a column to df called `km_per_drive`
### YOUR CODE HERE ###

df['km_per_drive'] = df['driven_km_drives'] / df['drives']

# Group by `label`, calculate the median, and isolate for km per drive
### YOUR CODE HERE ###

df.groupby('label')['km_per_drive'].median()

label
churned     74.109416
retained    75.014702
Name: km_per_drive, dtype: float64

# Add a column to df called `km_per_driving_day`
### YOUR CODE HERE ###

df['km_per_driving_day'] = df['driven_km_drives'] / df['driving_days']


# Group by `label`, calculate the median, and isolate for km per driving day
### YOUR CODE HERE ###

df.groupby('label')['km_per_driving_day'].median()

label
churned     697.541999
retained    289.549333
Name: km_per_driving_day, dtype: float64

# Add a column to df called `drives_per_driving_day`
### YOUR CODE HERE ###

df['drives_per_driving_day'] = df['drives'] / df['driving_days']

# Group by `label`, calculate the median, and isolate for drives per driving day
### YOUR CODE HERE ###

df.groupby('label')['drives_per_driving_day'].median()

label
churned     10.0000
retained     4.0625
Name: drives_per_driving_day, dtype: float64

# For each label, calculate the number of Android users and iPhone users
### YOUR CODE HERE ###

df.groupby('label')['device'].value_counts()

label     device 
churned   iPhone     1645
          Android     891
retained  iPhone     7580
          Android    4183
Name: count, dtype: int64

# For each label, calculate the percentage of Android users and iPhone users
### YOUR CODE HERE ###

# Percentage of each device type within each label group
df.groupby('label')['device'].value_counts(normalize=True) * 100

label     device 
churned   iPhone     64.865931
          Android    35.134069
retained  iPhone     64.439344
          Android    35.560656
Name: proportion, dtype: float64

	ID	label	sessions	drives	total_sessions	n_days_after_onboarding	total_navigations_fav1	total_navigations_fav2	driven_km_drives	duration_minutes_drives	activity_days	driving_days	device
0	0	retained	283	226	296.748273	2276	208	0	2628.845068	1985.775061	28	19	Android
1	1	retained	133	107	326.896596	1225	19	64	13715.920550	3160.472914	13	11	iPhone
2	2	retained	114	95	135.522926	2651	0	0	3059.148818	1610.735904	14	8	Android
3	3	retained	49	40	67.589221	15	322	7	913.591123	587.196542	7	3	iPhone
4	4	retained	84	68	168.247020	1562	166	5	3950.202008	1219.555924	27	18	Android
5	5	retained	113	103	279.544437	2637	0	0	901.238699	439.101397	15	11	iPhone
6	6	retained	3	2	236.725314	360	185	18	5249.172828	726.577205	28	23	iPhone
7	7	retained	39	35	176.072845	2999	0	0	7892.052468	2466.981741	22	20	iPhone
8	8	retained	57	46	183.532018	424	0	26	2651.709764	1594.342984	25	20	Android
9	9	churned	84	68	244.802115	2997	72	0	6043.460295	2341.838528	7	3	iPhone

	ID	sessions	drives	total_sessions	n_days_after_onboarding	total_navigations_fav1	total_navigations_fav2	driven_km_drives	duration_minutes_drives	activity_days	driving_days
count	700.000000	700.000000	700.000000	700.000000	700.000000	700.000000	700.000000	700.000000	700.000000	700.000000	700.000000
mean	7405.584286	80.837143	67.798571	198.483348	1709.295714	118.717143	30.371429	3935.967029	1795.123358	15.382857	12.125714
std	4306.900234	79.987440	65.271926	140.561715	1005.306562	156.308140	46.306984	2443.107121	1419.242246	8.772714	7.626373
min	77.000000	0.000000	0.000000	5.582648	16.000000	0.000000	0.000000	290.119811	66.588493	0.000000	0.000000
25%	3744.500000	23.000000	20.000000	94.056340	869.000000	4.000000	0.000000	2119.344818	779.009271	8.000000	6.000000
50%	7443.000000	56.000000	47.500000	177.255925	1650.500000	62.500000	10.000000	3421.156721	1414.966279	15.000000	12.000000
75%	11007.000000	112.250000	94.000000	266.058022	2508.750000	169.250000	43.000000	5166.097373	2443.955404	23.000000	18.000000
max	14993.000000	556.000000	445.000000	1076.879741	3498.000000	1096.000000	352.000000	15135.391280	9746.253023	31.000000	30.000000

	ID	sessions	drives	total_sessions	n_days_after_onboarding	total_navigations_fav1	total_navigations_fav2	driven_km_drives	duration_minutes_drives	activity_days	driving_days
count	14299.000000	14299.000000	14299.000000	14299.000000	14299.000000	14299.000000	14299.000000	14299.000000	14299.000000	14299.000000	14299.000000
mean	7503.573117	80.623820	67.255822	189.547409	1751.822505	121.747395	29.638296	4044.401535	1864.199794	15.544653	12.182530
std	4331.207621	80.736502	65.947295	136.189764	1008.663834	147.713428	45.350890	2504.977970	1448.005047	9.016088	7.833835
min	0.000000	0.000000	0.000000	0.220211	4.000000	0.000000	0.000000	60.441250	18.282082	0.000000	0.000000
25%	3749.500000	23.000000	20.000000	90.457733	878.500000	10.000000	0.000000	2217.319909	840.181344	8.000000	5.000000
50%	7504.000000	56.000000	48.000000	158.718571	1749.000000	71.000000	9.000000	3496.545617	1479.394387	16.000000	12.000000
75%	11257.500000	111.000000	93.000000	253.540450	2627.500000	178.000000	43.000000	5299.972162	2466.928876	23.000000	19.000000
max	14998.000000	743.000000	596.000000	1216.154633	3500.000000	1236.000000	415.000000	21183.401890	15851.727160	31.000000	30.000000

	Missing Label										...	Label Present
	ID	sessions	drives	total_sessions	n_days_after_onboarding	total_navigations_fav1	total_navigations_fav2	driven_km_drives	duration_minutes_drives	activity_days	...	sessions	drives	total_sessions	n_days_after_onboarding	total_navigations_fav1	total_navigations_fav2	driven_km_drives	duration_minutes_drives	activity_days	driving_days
count	700.000000	700.000000	700.000000	700.000000	700.000000	700.000000	700.000000	700.000000	700.000000	700.000000	...	14299.000000	14299.000000	14299.000000	14299.000000	14299.000000	14299.000000	14299.000000	14299.000000	14299.000000	14299.000000
mean	7405.584286	80.837143	67.798571	198.483348	1709.295714	118.717143	30.371429	3935.967029	1795.123358	15.382857	...	80.623820	67.255822	189.547409	1751.822505	121.747395	29.638296	4044.401535	1864.199794	15.544653	12.182530
std	4306.900234	79.987440	65.271926	140.561715	1005.306562	156.308140	46.306984	2443.107121	1419.242246	8.772714	...	80.736502	65.947295	136.189764	1008.663834	147.713428	45.350890	2504.977970	1448.005047	9.016088	7.833835
min	77.000000	0.000000	0.000000	5.582648	16.000000	0.000000	0.000000	290.119811	66.588493	0.000000	...	0.000000	0.000000	0.220211	4.000000	0.000000	0.000000	60.441250	18.282082	0.000000	0.000000
25%	3744.500000	23.000000	20.000000	94.056340	869.000000	4.000000	0.000000	2119.344818	779.009271	8.000000	...	23.000000	20.000000	90.457733	878.500000	10.000000	0.000000	2217.319909	840.181344	8.000000	5.000000
50%	7443.000000	56.000000	47.500000	177.255925	1650.500000	62.500000	10.000000	3421.156721	1414.966279	15.000000	...	56.000000	48.000000	158.718571	1749.000000	71.000000	9.000000	3496.545617	1479.394387	16.000000	12.000000
75%	11007.000000	112.250000	94.000000	266.058022	2508.750000	169.250000	43.000000	5166.097373	2443.955404	23.000000	...	111.000000	93.000000	253.540450	2627.500000	178.000000	43.000000	5299.972162	2466.928876	23.000000	19.000000
max	14993.000000	556.000000	445.000000	1076.879741	3498.000000	1096.000000	352.000000	15135.391280	9746.253023	31.000000	...	743.000000	596.000000	1216.154633	3500.000000	1236.000000	415.000000	21183.401890	15851.727160	31.000000	30.000000

	ID	sessions	drives	total_sessions	n_days_after_onboarding	total_navigations_fav1	total_navigations_fav2	driven_km_drives	duration_minutes_drives	activity_days	driving_days
count	14299.000000	14299.000000	14299.000000	14299.000000	14299.000000	14299.000000	14299.000000	14299.000000	14299.000000	14299.000000	14299.000000
mean	7503.573117	80.623820	67.255822	189.547409	1751.822505	121.747395	29.638296	4044.401535	1864.199794	15.544653	12.182530
std	4331.207621	80.736502	65.947295	136.189764	1008.663834	147.713428	45.350890	2504.977970	1448.005047	9.016088	7.833835
min	0.000000	0.000000	0.000000	0.220211	4.000000	0.000000	0.000000	60.441250	18.282082	0.000000	0.000000
25%	3749.500000	23.000000	20.000000	90.457733	878.500000	10.000000	0.000000	2217.319909	840.181344	8.000000	5.000000
50%	7504.000000	56.000000	48.000000	158.718571	1749.000000	71.000000	9.000000	3496.545617	1479.394387	16.000000	12.000000
75%	11257.500000	111.000000	93.000000	253.540450	2627.500000	178.000000	43.000000	5299.972162	2466.928876	23.000000	19.000000
max	14998.000000	743.000000	596.000000	1216.154633	3500.000000	1236.000000	415.000000	21183.401890	15851.727160	31.000000	30.000000

	ID	label	sessions	drives	total_sessions	n_days_after_onboarding	total_navigations_fav1	total_navigations_fav2	driven_km_drives	duration_minutes_drives	activity_days	driving_days	device
77	77	NaN	63	50	133.104155	783	201	0	2649.015822	1517.209970	19	13	iPhone
80	80	NaN	116	93	436.060183	1584	283	62	4183.409514	3121.889952	18	15	iPhone
98	98	NaN	78	64	583.492789	3414	0	0	1811.140893	642.189122	12	11	Android
111	111	NaN	106	102	113.379056	2228	14	0	2817.481840	2011.724274	17	13	Android
142	142	NaN	32	26	222.129310	208	55	10	2459.816477	874.427617	11	7	iPhone

	ID	sessions	drives	total_sessions	n_days_after_onboarding	total_navigations_fav1	total_navigations_fav2	driven_km_drives	duration_minutes_drives	activity_days	driving_days
label
churned	7477.5	59.0	50.0	164.339042	1321.0	84.5	11.0	3652.655666	1607.183785	8.0	6.0
retained	7509.0	56.0	47.0	157.586756	1843.0	68.0	9.0	3464.684614	1458.046141	17.0	14.0