In [1]:
# Data manipulation and analysis
import pandas as pd
import numpy as np
# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Optional: improve plot aesthetics
sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (10, 6)
# Optional: for interactive plots
# import plotly.express as px
# Optional: ignore warnings for cleaner output
import warnings
warnings.filterwarnings("ignore")
In [2]:
# Load the dataset
df = pd.read_csv("Unicorn_Companies.csv")
# Display the first few rows to check the data
df.head()
Out[2]:
| Company | Valuation | Date Joined | Industry | City | Country/Region | Continent | Year Founded | Funding | Select Investors | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Bytedance | $180B | 4/7/17 | Artificial intelligence | Beijing | China | Asia | 2012 | $8B | Sequoia Capital China, SIG Asia Investments, S... |
| 1 | SpaceX | $100B | 12/1/12 | Other | Hawthorne | United States | North America | 2002 | $7B | Founders Fund, Draper Fisher Jurvetson, Rothen... |
| 2 | SHEIN | $100B | 7/3/18 | E-commerce & direct-to-consumer | Shenzhen | China | Asia | 2008 | $2B | Tiger Global Management, Sequoia Capital China... |
| 3 | Stripe | $95B | 1/23/14 | Fintech | San Francisco | United States | North America | 2010 | $2B | Khosla Ventures, LowercaseCapital, capitalG |
| 4 | Klarna | $46B | 12/12/11 | Fintech | Stockholm | Sweden | Europe | 2005 | $4B | Institutional Venture Partners, Sequoia Capita... |
In [3]:
# Display the first 10 rows of the dataset
df.head(10)
Out[3]:
| Company | Valuation | Date Joined | Industry | City | Country/Region | Continent | Year Founded | Funding | Select Investors | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Bytedance | $180B | 4/7/17 | Artificial intelligence | Beijing | China | Asia | 2012 | $8B | Sequoia Capital China, SIG Asia Investments, S... |
| 1 | SpaceX | $100B | 12/1/12 | Other | Hawthorne | United States | North America | 2002 | $7B | Founders Fund, Draper Fisher Jurvetson, Rothen... |
| 2 | SHEIN | $100B | 7/3/18 | E-commerce & direct-to-consumer | Shenzhen | China | Asia | 2008 | $2B | Tiger Global Management, Sequoia Capital China... |
| 3 | Stripe | $95B | 1/23/14 | Fintech | San Francisco | United States | North America | 2010 | $2B | Khosla Ventures, LowercaseCapital, capitalG |
| 4 | Klarna | $46B | 12/12/11 | Fintech | Stockholm | Sweden | Europe | 2005 | $4B | Institutional Venture Partners, Sequoia Capita... |
| 5 | Canva | $40B | 1/8/18 | Internet software & services | Surry Hills | Australia | Oceania | 2012 | $572M | Sequoia Capital China, Blackbird Ventures, Mat... |
| 6 | Checkout.com | $40B | 5/2/19 | Fintech | London | United Kingdom | Europe | 2012 | $2B | Tiger Global Management, Insight Partners, DST... |
| 7 | Instacart | $39B | 12/30/14 | Supply chain, logistics, & delivery | San Francisco | United States | North America | 2012 | $3B | Khosla Ventures, Kleiner Perkins Caufield & By... |
| 8 | JUUL Labs | $38B | 12/20/17 | Consumer & retail | San Francisco | United States | North America | 2015 | $14B | Tiger Global Management |
| 9 | Databricks | $38B | 2/5/19 | Data management & analytics | San Francisco | United States | North America | 2013 | $3B | Andreessen Horowitz, New Enterprise Associates... |
In [4]:
# Identify the number of rows and columns in the dataset.
### YOUR CODE HERE ###
# Get the number of rows and columns
df.shape
Out[4]:
(1074, 10)
In [5]:
# Check for duplicate rows
duplicate_rows = df.duplicated()
In [6]:
df.drop_duplicates().shape
Out[6]:
(1074, 10)
In [7]:
# Display the data types of each column
df.dtypes
Out[7]:
Company object Valuation object Date Joined object Industry object City object Country/Region object Continent object Year Founded int64 Funding object Select Investors object dtype: object
In [9]:
# Check the column names
print(df.columns)
Index(['Company', 'Valuation', 'Date Joined', 'Industry', 'City',
'Country/Region', 'Continent', 'Year Founded', 'Funding',
'Select Investors'],
dtype='object')
In [11]:
# Sort the DataFrame by 'Year Founded' in ascending order (earliest founded first)
df_sorted = df.sort_values(by='Year Founded', ascending=True)
# Display the first 10 rows
df_sorted.head(10)
Out[11]:
| Company | Valuation | Date Joined | Industry | City | Country/Region | Continent | Year Founded | Funding | Select Investors | |
|---|---|---|---|---|---|---|---|---|---|---|
| 189 | Otto Bock HealthCare | $4B | 6/24/17 | Health | Duderstadt | Germany | Europe | 1919 | $0M | EQT Partners |
| 373 | Promasidor Holdings | $2B | 11/8/16 | Consumer & retail | Bryanston | South Africa | Asia | 1979 | $556M | IFC, Ajinomoto |
| 699 | Five Star Business Finance | $1B | 3/26/21 | Other | Chennai | India | Asia | 1984 | $456M | Sequoia Capital India, Tiger Global Management... |
| 840 | Radius Payment Solutions | $1B | 11/27/17 | Fintech | Crewe | United Kingdom | Europe | 1990 | $200M | Inflexion Private Equity |
| 11 | Epic Games | $32B | 10/26/18 | Other | Cary | United States | North America | 1991 | $7B | Tencent Holdings, KKR, Smash Ventures |
| 377 | BGL Group | $2B | 11/24/17 | Fintech | Peterborough | United Kingdom | Europe | 1992 | $696M | CPP Investment Board |
| 1043 | Thirty Madison | $1B | 6/2/21 | Health | New York | United States | North America | 1993 | $210M | Northzone Ventures, Maveron, Johnson & Johnson... |
| 118 | Vice Media | $6B | 4/2/11 | Internet software & services | Brooklyn | United States | North America | 1994 | $1B | Technology Crossover Ventures, A&E Television ... |
| 129 | Howden Group Holdings | $5B | 9/29/20 | Other | London | United Kingdom | Europe | 1994 | $187M | General Atlantic, 3i Group, Huagai Capital |
| 360 | Carzone | $2B | 3/1/19 | E-commerce & direct-to-consumer | Jiangsu | China | Asia | 1995 | $946M | Alibaba Group,Co-Stone Venture Capital, Buhuo ... |
In [12]:
# Display each unique year that occurs in the dataset
# along with the number of companies that were founded in each unique year.
### YOUR CODE HERE ###
companies_per_year = df.groupby('Year Founded')['Company'].count()
print(companies_per_year)
Year Founded 1919 1 1979 1 1984 1 1990 1 1991 1 1992 1 1993 1 1994 2 1995 2 1996 1 1997 1 1998 5 1999 8 2000 11 2001 9 2002 4 2003 8 2004 8 2005 14 2006 15 2007 24 2008 27 2009 34 2010 40 2011 82 2012 95 2013 87 2014 109 2015 155 2016 110 2017 74 2018 61 2019 45 2020 25 2021 11 Name: Company, dtype: int64
In [13]:
# Plot a histogram of the Year Founded feature.
### YOUR CODE HERE ###
import matplotlib.pyplot as plt
# Ensure 'Year Founded' is numeric
df['Year Founded'] = pd.to_numeric(df['Year Founded'], errors='coerce')
# Plot histogram
plt.hist(df['Year Founded'].dropna(), bins=range(int(df['Year Founded'].min()), int(df['Year Founded'].max()) + 1), edgecolor='black')
plt.xlabel('Year Founded')
plt.ylabel('Number of Companies')
plt.title('Distribution of Unicorn Companies by Year Founded')
plt.xticks(rotation=45)
plt.show()
In [14]:
# Convert the `Date Joined` column to datetime.
# Update the column with the converted values.
### YOUR CODE HERE ###
# Convert the 'Date Joined' column to datetime
df['Date Joined'] = pd.to_datetime(df['Date Joined'], errors='coerce')
# Display the first few rows to verify
df.head()
# Display the data types of the columns in `companies`
# to confirm that the update actually took place.
### YOUR CODE HERE ###
# Display the data types of all columns
df.dtypes
Out[14]:
Company object Valuation object Date Joined datetime64[ns] Industry object City object Country/Region object Continent object Year Founded int64 Funding object Select Investors object dtype: object
In [18]:
# Obtain the names of the months when companies gained unicorn status.
# Create 'Month Joined' column from 'Date Joined'
df['Month Joined'] = df['Date Joined'].dt.month_name()
# Display the first few rows of `companies`
# Display the first few rows to verify the new column
df.head()
Out[18]:
| Company | Valuation | Date Joined | Industry | City | Country/Region | Continent | Year Founded | Funding | Select Investors | Month Joined | Years To Join | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Bytedance | $180B | 2017-04-07 | Artificial intelligence | Beijing | China | Asia | 2012 | $8B | Sequoia Capital China, SIG Asia Investments, S... | April | 5 |
| 1 | SpaceX | $100B | 2012-12-01 | Other | Hawthorne | United States | North America | 2002 | $7B | Founders Fund, Draper Fisher Jurvetson, Rothen... | December | 10 |
| 2 | SHEIN | $100B | 2018-07-03 | E-commerce & direct-to-consumer | Shenzhen | China | Asia | 2008 | $2B | Tiger Global Management, Sequoia Capital China... | July | 10 |
| 3 | Stripe | $95B | 2014-01-23 | Fintech | San Francisco | United States | North America | 2010 | $2B | Khosla Ventures, LowercaseCapital, capitalG | January | 4 |
| 4 | Klarna | $46B | 2011-12-12 | Fintech | Stockholm | Sweden | Europe | 2005 | $4B | Institutional Venture Partners, Sequoia Capita... | December | 6 |
In [17]:
# Create 'Years To Join' column by subtracting 'Year Founded' from the year of 'Date Joined'
df['Years To Join'] = df['Date Joined'].dt.year - df['Year Founded']
# Display the first few rows to verify the new column
df.head()
# Display the first few rows of `companies`
# to confirm that the new column did get added.
df.head()
Out[17]:
| Company | Valuation | Date Joined | Industry | City | Country/Region | Continent | Year Founded | Funding | Select Investors | Month Joined | Years To Join | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Bytedance | $180B | 2017-04-07 | Artificial intelligence | Beijing | China | Asia | 2012 | $8B | Sequoia Capital China, SIG Asia Investments, S... | April | 5 |
| 1 | SpaceX | $100B | 2012-12-01 | Other | Hawthorne | United States | North America | 2002 | $7B | Founders Fund, Draper Fisher Jurvetson, Rothen... | December | 10 |
| 2 | SHEIN | $100B | 2018-07-03 | E-commerce & direct-to-consumer | Shenzhen | China | Asia | 2008 | $2B | Tiger Global Management, Sequoia Capital China... | July | 10 |
| 3 | Stripe | $95B | 2014-01-23 | Fintech | San Francisco | United States | North America | 2010 | $2B | Khosla Ventures, LowercaseCapital, capitalG | January | 4 |
| 4 | Klarna | $46B | 2011-12-12 | Fintech | Stockholm | Sweden | Europe | 2005 | $4B | Institutional Venture Partners, Sequoia Capita... | December | 6 |
In [20]:
# Count companies per year
year_counts = df['Year Joined'].value_counts().sort_index()
# Plot bar chart
year_counts.plot(kind='bar')
plt.xlabel('Year Joined')
plt.ylabel('Number of Companies')
plt.title('Companies Reaching Unicorn Status by Year')
plt.show()
In [21]:
# Count companies that joined in 2021
num_2021 = (df['Year Joined'] == 2021).sum()
print(num_2021)
520
In [22]:
# Filter dataset by a year of your interest (in terms of when companies reached unicorn status).
# Save the resulting subset in a new variable.
# Display the first few rows of the subset to confirm that it was created.
# Convert Date Joined to datetime (if not already done)
df['Date Joined'] = pd.to_datetime(df['Date Joined'])
# Create Year Joined column
df['Year Joined'] = df['Date Joined'].dt.year
# Filter dataset for 2021
companies_2021 = df[df['Year Joined'] == 2021]
# Display the subset
companies_2021.head()
Out[22]:
| Company | Valuation | Date Joined | Industry | City | Country/Region | Continent | Year Founded | Funding | Select Investors | Month Joined | Years To Join | Year Joined | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 12 | FTX | $32B | 2021-07-20 | Fintech | NaN | Bahamas | North America | 2018 | $2B | Sequoia Capital, Thoma Bravo, Softbank | July | 3 | 2021 |
| 16 | J&T Express | $20B | 2021-04-07 | Supply chain, logistics, & delivery | Jakarta | Indonesia | Asia | 2015 | $5B | Hillhouse Capital Management, Boyu Capital, Se... | April | 6 | 2021 |
| 24 | Blockchain.com | $14B | 2021-02-17 | Fintech | London | United Kingdom | Europe | 2011 | $490M | Lightspeed Venture Partners, Google Ventures, ... | February | 10 | 2021 |
| 27 | OpenSea | $13B | 2021-07-20 | E-commerce & direct-to-consumer | New York | United States | North America | 2017 | $427M | Andreessen Horowitz, Thirty Five Ventures, Sou... | July | 4 | 2021 |
| 34 | Getir | $12B | 2021-03-26 | E-commerce & direct-to-consumer | Istanbul | Turkey | Europe | 2015 | $2B | Tiger Global Management, Sequoia Capital, Revo... | March | 6 | 2021 |
In [25]:
import matplotlib.pyplot as plt
# Filter companies that joined in 2021
companies_2021 = df[df['Date Joined'].dt.year == 2021]
# Extract month numbers
months_2021 = companies_2021['Date Joined'].dt.month
# Plot histogram
plt.hist(months_2021, bins=12)
plt.xlabel('Month (2021)')
plt.ylabel('Number of Companies')
plt.title('Distribution of Unicorn Companies by Month in 2021')
plt.show()
In [26]:
import pandas as pd
# Step 1: Take the subset for 2021
companies_2021 = df[df['Date Joined'].dt.year == 2021].copy()
# Add a column for the month name
companies_2021['Month Joined'] = companies_2021['Date Joined'].dt.month_name()
# Optional: if you prefer month numbers for sorting
companies_2021['Month Number'] = companies_2021['Date Joined'].dt.month
# Step 2: Group by the time interval (month) and count companies
monthly_counts = companies_2021.groupby(['Month Number', 'Month Joined'])['Company'].count().reset_index()
# Rename column for clarity
monthly_counts.rename(columns={'Company': 'Number of Companies'}, inplace=True)
# Sort by month number to ensure chronological order
monthly_counts = monthly_counts.sort_values('Month Number')
# Display the result
monthly_counts
Out[26]:
| Month Number | Month Joined | Number of Companies | |
|---|---|---|---|
| 0 | 1 | January | 34 |
| 1 | 2 | February | 22 |
| 2 | 3 | March | 52 |
| 3 | 4 | April | 43 |
| 4 | 5 | May | 46 |
| 5 | 6 | June | 55 |
| 6 | 7 | July | 50 |
| 7 | 8 | August | 32 |
| 8 | 9 | September | 52 |
| 9 | 10 | October | 45 |
| 10 | 11 | November | 47 |
| 11 | 12 | December | 42 |
In [27]:
# Display the first few rows to confirm
monthly_counts.head()
Out[27]:
| Month Number | Month Joined | Number of Companies | |
|---|---|---|---|
| 0 | 1 | January | 34 |
| 1 | 2 | February | 22 |
| 2 | 3 | March | 52 |
| 3 | 4 | April | 43 |
| 4 | 5 | May | 46 |
In [28]:
import pandas as pd
# Step 1: Filter for the additional year (2020)
companies_2020 = df[df['Date Joined'].dt.year == 2020].copy()
# Step 2: Concatenate with the previous subset (e.g., companies_2021)
combined_df = pd.concat([companies_2020, companies_2021], ignore_index=True)
# Step 3: Add a column for the month number or month name if not already present
combined_df['Month Number'] = combined_df['Date Joined'].dt.month
combined_df['Month Joined'] = combined_df['Date Joined'].dt.month_name()
combined_df['Year Joined'] = combined_df['Date Joined'].dt.year
# Step 4: Transform 'Valuation' column as needed (ensure numeric, remove $/B if necessary)
# Example: if Valuation is in string like '$1.2B'
combined_df['Valuation Numeric'] = combined_df['Valuation'].replace('[\$,B]', '', regex=True).astype(float)
# Step 5: Group by Year and Month, and compute average valuation
avg_valuation = combined_df.groupby(['Year Joined', 'Month Number', 'Month Joined'])['Valuation Numeric'] \
.mean() \
.reset_index()
# Optional: sort by year and month
avg_valuation = avg_valuation.sort_values(['Year Joined', 'Month Number'])
# Display the first few rows
avg_valuation.head()
Out[28]:
| Year Joined | Month Number | Month Joined | Valuation Numeric | |
|---|---|---|---|---|
| 0 | 2020 | 1 | January | 3.200000 |
| 1 | 2020 | 2 | February | 2.666667 |
| 2 | 2020 | 3 | March | 4.400000 |
| 3 | 2020 | 4 | April | 5.166667 |
| 4 | 2020 | 5 | May | 2.500000 |
In [29]:
# Display the first few rows to confirm
avg_valuation.head()
Out[29]:
| Year Joined | Month Number | Month Joined | Valuation Numeric | |
|---|---|---|---|---|
| 0 | 2020 | 1 | January | 3.200000 |
| 1 | 2020 | 2 | February | 2.666667 |
| 2 | 2020 | 3 | March | 4.400000 |
| 3 | 2020 | 4 | April | 5.166667 |
| 4 | 2020 | 5 | May | 2.500000 |
In [30]:
import matplotlib.pyplot as plt
import seaborn as sns
# Step 1: Define months in chronological order
months_order = ['January', 'February', 'March', 'April', 'May', 'June',
'July', 'August', 'September', 'October', 'November', 'December']
# Step 2: Ensure 'Month Joined' and 'Years To Join' columns exist
# Month Joined
df['Month Joined'] = df['Date Joined'].dt.month_name()
# Years To Join
df['Years To Join'] = df['Date Joined'].dt.year - df['Year Founded']
# Step 3: Create a box plot
plt.figure(figsize=(12,6))
sns.boxplot(data=df, x='Month Joined', y='Years To Join', order=months_order)
plt.xlabel('Month Joined')
plt.ylabel('Years to Become Unicorn')
plt.title('Distribution of Time Taken to Reach Unicorn Status by Month Joined')
plt.xticks(rotation=45)
plt.show()
In [31]:
# Print the months in chronological order
print(months_order)
['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
In [32]:
import matplotlib.pyplot as plt
import seaborn as sns
# Ensure the 'Month Joined' and 'Years To Join' columns exist
df['Month Joined'] = df['Date Joined'].dt.month_name()
df['Years To Join'] = df['Date Joined'].dt.year - df['Year Founded']
# Define months in chronological order
months_order = ['January', 'February', 'March', 'April', 'May', 'June',
'July', 'August', 'September', 'October', 'November', 'December']
# Create the box plot
plt.figure(figsize=(12,6))
sns.boxplot(data=df, x='Month Joined', y='Years To Join', order=months_order)
plt.xlabel('Month Joined')
plt.ylabel('Years to Become Unicorn')
plt.title('Distribution of Time Taken to Reach Unicorn Status by Month Joined')
plt.xticks(rotation=45)
plt.show()
In [33]:
import matplotlib.pyplot as plt
import seaborn as sns
# Ensure 'Years To Join' exists
df['Years To Join'] = df['Date Joined'].dt.year - df['Year Founded']
# Group by Year Founded and calculate average Years To Join
avg_years = df.groupby('Year Founded')['Years To Join'].mean().reset_index()
# Create bar plot
plt.figure(figsize=(12,6))
sns.barplot(data=avg_years, x='Year Founded', y='Years To Join', palette='viridis')
plt.xlabel('Year Founded')
plt.ylabel('Average Years to Become Unicorn')
plt.title('Average Time to Reach Unicorn Status by Year Founded')
plt.xticks(rotation=45)
plt.show()
In [34]:
import matplotlib.pyplot as plt
import seaborn as sns
# Step 1: Filter for the year of interest (e.g., 2021)
companies_2021 = df[df['Date Joined'].dt.year == 2021].copy()
# Step 2: Extract month names
companies_2021['Month Joined'] = companies_2021['Date Joined'].dt.month_name()
# Step 3: Count companies per month
monthly_counts = companies_2021['Month Joined'].value_counts().reindex([
'January', 'February', 'March', 'April', 'May', 'June',
'July', 'August', 'September', 'October', 'November', 'December'
])
# Step 4: Create bar plot
plt.figure(figsize=(12,6))
sns.barplot(x=monthly_counts.index, y=monthly_counts.values, palette='coolwarm')
plt.xlabel('Month Joined (2021)')
plt.ylabel('Number of Companies')
plt.title('Number of Companies Reaching Unicorn Status per Month in 2021')
plt.xticks(rotation=45)
plt.show()
In [35]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Step 1: Ensure Date Joined is datetime
df['Date Joined'] = pd.to_datetime(df['Date Joined'])
# Step 2: Filter for the years of interest, e.g., 2020 and 2021
years_of_interest = [2020, 2021]
subset_years = df[df['Date Joined'].dt.year.isin(years_of_interest)].copy()
# Step 3: Extract year and quarter
subset_years['Year Joined'] = subset_years['Date Joined'].dt.year
subset_years['Quarter Joined'] = subset_years['Date Joined'].dt.to_period('Q').dt.strftime('%Y-Q%q')
# Step 4: Transform Valuation to numeric if needed
subset_years['Valuation Numeric'] = subset_years['Valuation'].replace('[\$,B]', '', regex=True).astype(float)
# Step 5: Group by Quarter and Year, compute average valuation
avg_valuation_quarter = subset_years.groupby(['Quarter Joined', 'Year Joined'])['Valuation Numeric'].mean().reset_index()
# Step 6: Pivot for grouped bar plot
avg_valuation_pivot = avg_valuation_quarter.pivot(index='Quarter Joined', columns='Year Joined', values='Valuation Numeric')
# Step 7: Plot grouped bar chart
avg_valuation_pivot.plot(kind='bar', figsize=(12,6))
plt.xlabel('Quarter')
plt.ylabel('Average Valuation (Billion $)')
plt.title('Average Valuation by Quarter for 2020 and 2021')
plt.xticks(rotation=45)
plt.legend(title='Year')
plt.show()
In [ ]: