# Data manipulation and analysis
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Optional: improve plot aesthetics
sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (10, 6)

# Optional: for interactive plots
# import plotly.express as px

# Optional: ignore warnings for cleaner output
import warnings
warnings.filterwarnings("ignore")

# Load the dataset
df = pd.read_csv("Unicorn_Companies.csv")

# Display the first few rows to check the data
df.head()

# Display the first 10 rows of the dataset
df.head(10)

# Identify the number of rows and columns in the dataset.

### YOUR CODE HERE ###

# Get the number of rows and columns
df.shape

(1074, 10)

# Check for duplicate rows
duplicate_rows = df.duplicated()

df.drop_duplicates().shape

(1074, 10)

# Display the data types of each column
df.dtypes

Company             object
Valuation           object
Date Joined         object
Industry            object
City                object
Country/Region      object
Continent           object
Year Founded         int64
Funding             object
Select Investors    object
dtype: object

# Check the column names
print(df.columns)

Index(['Company', 'Valuation', 'Date Joined', 'Industry', 'City',
       'Country/Region', 'Continent', 'Year Founded', 'Funding',
       'Select Investors'],
      dtype='object')

# Sort the DataFrame by 'Year Founded' in ascending order (earliest founded first)
df_sorted = df.sort_values(by='Year Founded', ascending=True)

# Display the first 10 rows
df_sorted.head(10)

# Display each unique year that occurs in the dataset
# along with the number of companies that were founded in each unique year.

### YOUR CODE HERE ###

companies_per_year = df.groupby('Year Founded')['Company'].count()
print(companies_per_year)

Year Founded
1919      1
1979      1
1984      1
1990      1
1991      1
1992      1
1993      1
1994      2
1995      2
1996      1
1997      1
1998      5
1999      8
2000     11
2001      9
2002      4
2003      8
2004      8
2005     14
2006     15
2007     24
2008     27
2009     34
2010     40
2011     82
2012     95
2013     87
2014    109
2015    155
2016    110
2017     74
2018     61
2019     45
2020     25
2021     11
Name: Company, dtype: int64

# Plot a histogram of the Year Founded feature.
### YOUR CODE HERE ###

import matplotlib.pyplot as plt

# Ensure 'Year Founded' is numeric
df['Year Founded'] = pd.to_numeric(df['Year Founded'], errors='coerce')

# Plot histogram
plt.hist(df['Year Founded'].dropna(), bins=range(int(df['Year Founded'].min()), int(df['Year Founded'].max()) + 1), edgecolor='black')
plt.xlabel('Year Founded')
plt.ylabel('Number of Companies')
plt.title('Distribution of Unicorn Companies by Year Founded')
plt.xticks(rotation=45)
plt.show()

# Convert the `Date Joined` column to datetime.
# Update the column with the converted values.

### YOUR CODE HERE ###

# Convert the 'Date Joined' column to datetime
df['Date Joined'] = pd.to_datetime(df['Date Joined'], errors='coerce')

# Display the first few rows to verify
df.head()



# Display the data types of the columns in `companies`
# to confirm that the update actually took place.

### YOUR CODE HERE ###
# Display the data types of all columns
df.dtypes

Company                     object
Valuation                   object
Date Joined         datetime64[ns]
Industry                    object
City                        object
Country/Region              object
Continent                   object
Year Founded                 int64
Funding                     object
Select Investors            object
dtype: object

# Obtain the names of the months when companies gained unicorn status.

# Create 'Month Joined' column from 'Date Joined'
df['Month Joined'] = df['Date Joined'].dt.month_name()

# Display the first few rows of `companies`



# Display the first few rows to verify the new column
df.head()

# Create 'Years To Join' column by subtracting 'Year Founded' from the year of 'Date Joined'
df['Years To Join'] = df['Date Joined'].dt.year - df['Year Founded']

# Display the first few rows to verify the new column
df.head()





# Display the first few rows of `companies`
# to confirm that the new column did get added.

df.head()

# Count companies per year
year_counts = df['Year Joined'].value_counts().sort_index()

# Plot bar chart
year_counts.plot(kind='bar')
plt.xlabel('Year Joined')
plt.ylabel('Number of Companies')
plt.title('Companies Reaching Unicorn Status by Year')
plt.show()

# Count companies that joined in 2021
num_2021 = (df['Year Joined'] == 2021).sum()

print(num_2021)

520

# Filter dataset by a year of your interest (in terms of when companies reached unicorn status).
# Save the resulting subset in a new variable. 





# Display the first few rows of the subset to confirm that it was created.


# Convert Date Joined to datetime (if not already done)
df['Date Joined'] = pd.to_datetime(df['Date Joined'])

# Create Year Joined column
df['Year Joined'] = df['Date Joined'].dt.year

# Filter dataset for 2021
companies_2021 = df[df['Year Joined'] == 2021]

# Display the subset
companies_2021.head()

import matplotlib.pyplot as plt

# Filter companies that joined in 2021
companies_2021 = df[df['Date Joined'].dt.year == 2021]

# Extract month numbers
months_2021 = companies_2021['Date Joined'].dt.month

# Plot histogram
plt.hist(months_2021, bins=12)
plt.xlabel('Month (2021)')
plt.ylabel('Number of Companies')
plt.title('Distribution of Unicorn Companies by Month in 2021')
plt.show()

import pandas as pd

# Step 1: Take the subset for 2021
companies_2021 = df[df['Date Joined'].dt.year == 2021].copy()

# Add a column for the month name
companies_2021['Month Joined'] = companies_2021['Date Joined'].dt.month_name()

# Optional: if you prefer month numbers for sorting
companies_2021['Month Number'] = companies_2021['Date Joined'].dt.month

# Step 2: Group by the time interval (month) and count companies
monthly_counts = companies_2021.groupby(['Month Number', 'Month Joined'])['Company'].count().reset_index()

# Rename column for clarity
monthly_counts.rename(columns={'Company': 'Number of Companies'}, inplace=True)

# Sort by month number to ensure chronological order
monthly_counts = monthly_counts.sort_values('Month Number')

# Display the result
monthly_counts

# Display the first few rows to confirm
monthly_counts.head()

import pandas as pd

# Step 1: Filter for the additional year (2020)
companies_2020 = df[df['Date Joined'].dt.year == 2020].copy()

# Step 2: Concatenate with the previous subset (e.g., companies_2021)
combined_df = pd.concat([companies_2020, companies_2021], ignore_index=True)

# Step 3: Add a column for the month number or month name if not already present
combined_df['Month Number'] = combined_df['Date Joined'].dt.month
combined_df['Month Joined'] = combined_df['Date Joined'].dt.month_name()
combined_df['Year Joined'] = combined_df['Date Joined'].dt.year

# Step 4: Transform 'Valuation' column as needed (ensure numeric, remove $/B if necessary)
# Example: if Valuation is in string like '$1.2B'
combined_df['Valuation Numeric'] = combined_df['Valuation'].replace('[\$,B]', '', regex=True).astype(float)

# Step 5: Group by Year and Month, and compute average valuation
avg_valuation = combined_df.groupby(['Year Joined', 'Month Number', 'Month Joined'])['Valuation Numeric'] \
                           .mean() \
                           .reset_index()

# Optional: sort by year and month
avg_valuation = avg_valuation.sort_values(['Year Joined', 'Month Number'])

# Display the first few rows
avg_valuation.head()

# Display the first few rows to confirm
avg_valuation.head()

import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Define months in chronological order
months_order = ['January', 'February', 'March', 'April', 'May', 'June', 
                'July', 'August', 'September', 'October', 'November', 'December']

# Step 2: Ensure 'Month Joined' and 'Years To Join' columns exist
# Month Joined
df['Month Joined'] = df['Date Joined'].dt.month_name()
# Years To Join
df['Years To Join'] = df['Date Joined'].dt.year - df['Year Founded']

# Step 3: Create a box plot
plt.figure(figsize=(12,6))
sns.boxplot(data=df, x='Month Joined', y='Years To Join', order=months_order)
plt.xlabel('Month Joined')
plt.ylabel('Years to Become Unicorn')
plt.title('Distribution of Time Taken to Reach Unicorn Status by Month Joined')
plt.xticks(rotation=45)
plt.show()

# Print the months in chronological order
print(months_order)

['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

import matplotlib.pyplot as plt
import seaborn as sns

# Ensure the 'Month Joined' and 'Years To Join' columns exist
df['Month Joined'] = df['Date Joined'].dt.month_name()
df['Years To Join'] = df['Date Joined'].dt.year - df['Year Founded']

# Define months in chronological order
months_order = ['January', 'February', 'March', 'April', 'May', 'June', 
                'July', 'August', 'September', 'October', 'November', 'December']

# Create the box plot
plt.figure(figsize=(12,6))
sns.boxplot(data=df, x='Month Joined', y='Years To Join', order=months_order)
plt.xlabel('Month Joined')
plt.ylabel('Years to Become Unicorn')
plt.title('Distribution of Time Taken to Reach Unicorn Status by Month Joined')
plt.xticks(rotation=45)
plt.show()

import matplotlib.pyplot as plt
import seaborn as sns

# Ensure 'Years To Join' exists
df['Years To Join'] = df['Date Joined'].dt.year - df['Year Founded']

# Group by Year Founded and calculate average Years To Join
avg_years = df.groupby('Year Founded')['Years To Join'].mean().reset_index()

# Create bar plot
plt.figure(figsize=(12,6))
sns.barplot(data=avg_years, x='Year Founded', y='Years To Join', palette='viridis')
plt.xlabel('Year Founded')
plt.ylabel('Average Years to Become Unicorn')
plt.title('Average Time to Reach Unicorn Status by Year Founded')
plt.xticks(rotation=45)
plt.show()

import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Filter for the year of interest (e.g., 2021)
companies_2021 = df[df['Date Joined'].dt.year == 2021].copy()

# Step 2: Extract month names
companies_2021['Month Joined'] = companies_2021['Date Joined'].dt.month_name()

# Step 3: Count companies per month
monthly_counts = companies_2021['Month Joined'].value_counts().reindex([
    'January', 'February', 'March', 'April', 'May', 'June', 
    'July', 'August', 'September', 'October', 'November', 'December'
])

# Step 4: Create bar plot
plt.figure(figsize=(12,6))
sns.barplot(x=monthly_counts.index, y=monthly_counts.values, palette='coolwarm')
plt.xlabel('Month Joined (2021)')
plt.ylabel('Number of Companies')
plt.title('Number of Companies Reaching Unicorn Status per Month in 2021')
plt.xticks(rotation=45)
plt.show()

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Ensure Date Joined is datetime
df['Date Joined'] = pd.to_datetime(df['Date Joined'])

# Step 2: Filter for the years of interest, e.g., 2020 and 2021
years_of_interest = [2020, 2021]
subset_years = df[df['Date Joined'].dt.year.isin(years_of_interest)].copy()

# Step 3: Extract year and quarter
subset_years['Year Joined'] = subset_years['Date Joined'].dt.year
subset_years['Quarter Joined'] = subset_years['Date Joined'].dt.to_period('Q').dt.strftime('%Y-Q%q')

# Step 4: Transform Valuation to numeric if needed
subset_years['Valuation Numeric'] = subset_years['Valuation'].replace('[\$,B]', '', regex=True).astype(float)

# Step 5: Group by Quarter and Year, compute average valuation
avg_valuation_quarter = subset_years.groupby(['Quarter Joined', 'Year Joined'])['Valuation Numeric'].mean().reset_index()

# Step 6: Pivot for grouped bar plot
avg_valuation_pivot = avg_valuation_quarter.pivot(index='Quarter Joined', columns='Year Joined', values='Valuation Numeric')

# Step 7: Plot grouped bar chart
avg_valuation_pivot.plot(kind='bar', figsize=(12,6))
plt.xlabel('Quarter')
plt.ylabel('Average Valuation (Billion $)')
plt.title('Average Valuation by Quarter for 2020 and 2021')
plt.xticks(rotation=45)
plt.legend(title='Year')
plt.show()

	Company	Valuation	Date Joined	Industry	City	Country/Region	Continent	Year Founded	Funding	Select Investors
0	Bytedance	$180B	4/7/17	Artificial intelligence	Beijing	China	Asia	2012	$8B	Sequoia Capital China, SIG Asia Investments, S...
1	SpaceX	$100B	12/1/12	Other	Hawthorne	United States	North America	2002	$7B	Founders Fund, Draper Fisher Jurvetson, Rothen...
2	SHEIN	$100B	7/3/18	E-commerce & direct-to-consumer	Shenzhen	China	Asia	2008	$2B	Tiger Global Management, Sequoia Capital China...
3	Stripe	$95B	1/23/14	Fintech	San Francisco	United States	North America	2010	$2B	Khosla Ventures, LowercaseCapital, capitalG
4	Klarna	$46B	12/12/11	Fintech	Stockholm	Sweden	Europe	2005	$4B	Institutional Venture Partners, Sequoia Capita...

	Company	Valuation	Date Joined	Industry	City	Country/Region	Continent	Year Founded	Funding	Select Investors
0	Bytedance	$180B	4/7/17	Artificial intelligence	Beijing	China	Asia	2012	$8B	Sequoia Capital China, SIG Asia Investments, S...
1	SpaceX	$100B	12/1/12	Other	Hawthorne	United States	North America	2002	$7B	Founders Fund, Draper Fisher Jurvetson, Rothen...
2	SHEIN	$100B	7/3/18	E-commerce & direct-to-consumer	Shenzhen	China	Asia	2008	$2B	Tiger Global Management, Sequoia Capital China...
3	Stripe	$95B	1/23/14	Fintech	San Francisco	United States	North America	2010	$2B	Khosla Ventures, LowercaseCapital, capitalG
4	Klarna	$46B	12/12/11	Fintech	Stockholm	Sweden	Europe	2005	$4B	Institutional Venture Partners, Sequoia Capita...
5	Canva	$40B	1/8/18	Internet software & services	Surry Hills	Australia	Oceania	2012	$572M	Sequoia Capital China, Blackbird Ventures, Mat...
6	Checkout.com	$40B	5/2/19	Fintech	London	United Kingdom	Europe	2012	$2B	Tiger Global Management, Insight Partners, DST...
7	Instacart	$39B	12/30/14	Supply chain, logistics, & delivery	San Francisco	United States	North America	2012	$3B	Khosla Ventures, Kleiner Perkins Caufield & By...
8	JUUL Labs	$38B	12/20/17	Consumer & retail	San Francisco	United States	North America	2015	$14B	Tiger Global Management
9	Databricks	$38B	2/5/19	Data management & analytics	San Francisco	United States	North America	2013	$3B	Andreessen Horowitz, New Enterprise Associates...

	Company	Valuation	Date Joined	Industry	City	Country/Region	Continent	Year Founded	Funding	Select Investors
189	Otto Bock HealthCare	$4B	6/24/17	Health	Duderstadt	Germany	Europe	1919	$0M	EQT Partners
373	Promasidor Holdings	$2B	11/8/16	Consumer & retail	Bryanston	South Africa	Asia	1979	$556M	IFC, Ajinomoto
699	Five Star Business Finance	$1B	3/26/21	Other	Chennai	India	Asia	1984	$456M	Sequoia Capital India, Tiger Global Management...
840	Radius Payment Solutions	$1B	11/27/17	Fintech	Crewe	United Kingdom	Europe	1990	$200M	Inflexion Private Equity
11	Epic Games	$32B	10/26/18	Other	Cary	United States	North America	1991	$7B	Tencent Holdings, KKR, Smash Ventures
377	BGL Group	$2B	11/24/17	Fintech	Peterborough	United Kingdom	Europe	1992	$696M	CPP Investment Board
1043	Thirty Madison	$1B	6/2/21	Health	New York	United States	North America	1993	$210M	Northzone Ventures, Maveron, Johnson & Johnson...
118	Vice Media	$6B	4/2/11	Internet software & services	Brooklyn	United States	North America	1994	$1B	Technology Crossover Ventures, A&E Television ...
129	Howden Group Holdings	$5B	9/29/20	Other	London	United Kingdom	Europe	1994	$187M	General Atlantic, 3i Group, Huagai Capital
360	Carzone	$2B	3/1/19	E-commerce & direct-to-consumer	Jiangsu	China	Asia	1995	$946M	Alibaba Group,Co-Stone Venture Capital, Buhuo ...

	Company	Valuation	Date Joined	Industry	City	Country/Region	Continent	Year Founded	Funding	Select Investors	Month Joined	Years To Join
0	Bytedance	$180B	2017-04-07	Artificial intelligence	Beijing	China	Asia	2012	$8B	Sequoia Capital China, SIG Asia Investments, S...	April	5
1	SpaceX	$100B	2012-12-01	Other	Hawthorne	United States	North America	2002	$7B	Founders Fund, Draper Fisher Jurvetson, Rothen...	December	10
2	SHEIN	$100B	2018-07-03	E-commerce & direct-to-consumer	Shenzhen	China	Asia	2008	$2B	Tiger Global Management, Sequoia Capital China...	July	10
3	Stripe	$95B	2014-01-23	Fintech	San Francisco	United States	North America	2010	$2B	Khosla Ventures, LowercaseCapital, capitalG	January	4
4	Klarna	$46B	2011-12-12	Fintech	Stockholm	Sweden	Europe	2005	$4B	Institutional Venture Partners, Sequoia Capita...	December	6

	Company	Valuation	Date Joined	Industry	City	Country/Region	Continent	Year Founded	Funding	Select Investors	Month Joined	Years To Join
0	Bytedance	$180B	2017-04-07	Artificial intelligence	Beijing	China	Asia	2012	$8B	Sequoia Capital China, SIG Asia Investments, S...	April	5
1	SpaceX	$100B	2012-12-01	Other	Hawthorne	United States	North America	2002	$7B	Founders Fund, Draper Fisher Jurvetson, Rothen...	December	10
2	SHEIN	$100B	2018-07-03	E-commerce & direct-to-consumer	Shenzhen	China	Asia	2008	$2B	Tiger Global Management, Sequoia Capital China...	July	10
3	Stripe	$95B	2014-01-23	Fintech	San Francisco	United States	North America	2010	$2B	Khosla Ventures, LowercaseCapital, capitalG	January	4
4	Klarna	$46B	2011-12-12	Fintech	Stockholm	Sweden	Europe	2005	$4B	Institutional Venture Partners, Sequoia Capita...	December	6

	Company	Valuation	Date Joined	Industry	City	Country/Region	Continent	Year Founded	Funding	Select Investors	Month Joined	Years To Join	Year Joined
12	FTX	$32B	2021-07-20	Fintech	NaN	Bahamas	North America	2018	$2B	Sequoia Capital, Thoma Bravo, Softbank	July	3	2021
16	J&T Express	$20B	2021-04-07	Supply chain, logistics, & delivery	Jakarta	Indonesia	Asia	2015	$5B	Hillhouse Capital Management, Boyu Capital, Se...	April	6	2021
24	Blockchain.com	$14B	2021-02-17	Fintech	London	United Kingdom	Europe	2011	$490M	Lightspeed Venture Partners, Google Ventures, ...	February	10	2021
27	OpenSea	$13B	2021-07-20	E-commerce & direct-to-consumer	New York	United States	North America	2017	$427M	Andreessen Horowitz, Thirty Five Ventures, Sou...	July	4	2021
34	Getir	$12B	2021-03-26	E-commerce & direct-to-consumer	Istanbul	Turkey	Europe	2015	$2B	Tiger Global Management, Sequoia Capital, Revo...	March	6	2021

	Year Joined	Month Number	Month Joined	Valuation Numeric
0	2020	1	January	3.200000
1	2020	2	February	2.666667
2	2020	3	March	4.400000
3	2020	4	April	5.166667
4	2020	5	May	2.500000