In [1]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Optional: improve plot aesthetics
sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (10, 6)

# Optional: for interactive plots
# import plotly.express as px

# Optional: ignore warnings for cleaner output
import warnings
warnings.filterwarnings("ignore")
In [2]:
# Load the dataset
df = pd.read_csv("Unicorn_Companies.csv")

# Display the first few rows to check the data
df.head()
Out[2]:
Company Valuation Date Joined Industry City Country/Region Continent Year Founded Funding Select Investors
0 Bytedance $180B 4/7/17 Artificial intelligence Beijing China Asia 2012 $8B Sequoia Capital China, SIG Asia Investments, S...
1 SpaceX $100B 12/1/12 Other Hawthorne United States North America 2002 $7B Founders Fund, Draper Fisher Jurvetson, Rothen...
2 SHEIN $100B 7/3/18 E-commerce & direct-to-consumer Shenzhen China Asia 2008 $2B Tiger Global Management, Sequoia Capital China...
3 Stripe $95B 1/23/14 Fintech San Francisco United States North America 2010 $2B Khosla Ventures, LowercaseCapital, capitalG
4 Klarna $46B 12/12/11 Fintech Stockholm Sweden Europe 2005 $4B Institutional Venture Partners, Sequoia Capita...
In [3]:
# Display the first 10 rows of the dataset
df.head(10)
Out[3]:
Company Valuation Date Joined Industry City Country/Region Continent Year Founded Funding Select Investors
0 Bytedance $180B 4/7/17 Artificial intelligence Beijing China Asia 2012 $8B Sequoia Capital China, SIG Asia Investments, S...
1 SpaceX $100B 12/1/12 Other Hawthorne United States North America 2002 $7B Founders Fund, Draper Fisher Jurvetson, Rothen...
2 SHEIN $100B 7/3/18 E-commerce & direct-to-consumer Shenzhen China Asia 2008 $2B Tiger Global Management, Sequoia Capital China...
3 Stripe $95B 1/23/14 Fintech San Francisco United States North America 2010 $2B Khosla Ventures, LowercaseCapital, capitalG
4 Klarna $46B 12/12/11 Fintech Stockholm Sweden Europe 2005 $4B Institutional Venture Partners, Sequoia Capita...
5 Canva $40B 1/8/18 Internet software & services Surry Hills Australia Oceania 2012 $572M Sequoia Capital China, Blackbird Ventures, Mat...
6 Checkout.com $40B 5/2/19 Fintech London United Kingdom Europe 2012 $2B Tiger Global Management, Insight Partners, DST...
7 Instacart $39B 12/30/14 Supply chain, logistics, & delivery San Francisco United States North America 2012 $3B Khosla Ventures, Kleiner Perkins Caufield & By...
8 JUUL Labs $38B 12/20/17 Consumer & retail San Francisco United States North America 2015 $14B Tiger Global Management
9 Databricks $38B 2/5/19 Data management & analytics San Francisco United States North America 2013 $3B Andreessen Horowitz, New Enterprise Associates...
In [4]:
# Identify the number of rows and columns in the dataset.

### YOUR CODE HERE ###

# Get the number of rows and columns
df.shape
Out[4]:
(1074, 10)
In [5]:
# Check for duplicate rows
duplicate_rows = df.duplicated()
In [6]:
df.drop_duplicates().shape
Out[6]:
(1074, 10)
In [7]:
# Display the data types of each column
df.dtypes
Out[7]:
Company             object
Valuation           object
Date Joined         object
Industry            object
City                object
Country/Region      object
Continent           object
Year Founded         int64
Funding             object
Select Investors    object
dtype: object
In [9]:
# Check the column names
print(df.columns)
Index(['Company', 'Valuation', 'Date Joined', 'Industry', 'City',
       'Country/Region', 'Continent', 'Year Founded', 'Funding',
       'Select Investors'],
      dtype='object')
In [11]:
# Sort the DataFrame by 'Year Founded' in ascending order (earliest founded first)
df_sorted = df.sort_values(by='Year Founded', ascending=True)

# Display the first 10 rows
df_sorted.head(10)
Out[11]:
Company Valuation Date Joined Industry City Country/Region Continent Year Founded Funding Select Investors
189 Otto Bock HealthCare $4B 6/24/17 Health Duderstadt Germany Europe 1919 $0M EQT Partners
373 Promasidor Holdings $2B 11/8/16 Consumer & retail Bryanston South Africa Asia 1979 $556M IFC, Ajinomoto
699 Five Star Business Finance $1B 3/26/21 Other Chennai India Asia 1984 $456M Sequoia Capital India, Tiger Global Management...
840 Radius Payment Solutions $1B 11/27/17 Fintech Crewe United Kingdom Europe 1990 $200M Inflexion Private Equity
11 Epic Games $32B 10/26/18 Other Cary United States North America 1991 $7B Tencent Holdings, KKR, Smash Ventures
377 BGL Group $2B 11/24/17 Fintech Peterborough United Kingdom Europe 1992 $696M CPP Investment Board
1043 Thirty Madison $1B 6/2/21 Health New York United States North America 1993 $210M Northzone Ventures, Maveron, Johnson & Johnson...
118 Vice Media $6B 4/2/11 Internet software & services Brooklyn United States North America 1994 $1B Technology Crossover Ventures, A&E Television ...
129 Howden Group Holdings $5B 9/29/20 Other London United Kingdom Europe 1994 $187M General Atlantic, 3i Group, Huagai Capital
360 Carzone $2B 3/1/19 E-commerce & direct-to-consumer Jiangsu China Asia 1995 $946M Alibaba Group,Co-Stone Venture Capital, Buhuo ...
In [12]:
# Display each unique year that occurs in the dataset
# along with the number of companies that were founded in each unique year.

### YOUR CODE HERE ###

companies_per_year = df.groupby('Year Founded')['Company'].count()
print(companies_per_year)
Year Founded
1919      1
1979      1
1984      1
1990      1
1991      1
1992      1
1993      1
1994      2
1995      2
1996      1
1997      1
1998      5
1999      8
2000     11
2001      9
2002      4
2003      8
2004      8
2005     14
2006     15
2007     24
2008     27
2009     34
2010     40
2011     82
2012     95
2013     87
2014    109
2015    155
2016    110
2017     74
2018     61
2019     45
2020     25
2021     11
Name: Company, dtype: int64
In [13]:
# Plot a histogram of the Year Founded feature.
### YOUR CODE HERE ###

import matplotlib.pyplot as plt

# Ensure 'Year Founded' is numeric
df['Year Founded'] = pd.to_numeric(df['Year Founded'], errors='coerce')

# Plot histogram
plt.hist(df['Year Founded'].dropna(), bins=range(int(df['Year Founded'].min()), int(df['Year Founded'].max()) + 1), edgecolor='black')
plt.xlabel('Year Founded')
plt.ylabel('Number of Companies')
plt.title('Distribution of Unicorn Companies by Year Founded')
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
In [14]:
# Convert the `Date Joined` column to datetime.
# Update the column with the converted values.

### YOUR CODE HERE ###

# Convert the 'Date Joined' column to datetime
df['Date Joined'] = pd.to_datetime(df['Date Joined'], errors='coerce')

# Display the first few rows to verify
df.head()



# Display the data types of the columns in `companies`
# to confirm that the update actually took place.

### YOUR CODE HERE ###
# Display the data types of all columns
df.dtypes
Out[14]:
Company                     object
Valuation                   object
Date Joined         datetime64[ns]
Industry                    object
City                        object
Country/Region              object
Continent                   object
Year Founded                 int64
Funding                     object
Select Investors            object
dtype: object
In [18]:
# Obtain the names of the months when companies gained unicorn status.

# Create 'Month Joined' column from 'Date Joined'
df['Month Joined'] = df['Date Joined'].dt.month_name()

# Display the first few rows of `companies`



# Display the first few rows to verify the new column
df.head()
Out[18]:
Company Valuation Date Joined Industry City Country/Region Continent Year Founded Funding Select Investors Month Joined Years To Join
0 Bytedance $180B 2017-04-07 Artificial intelligence Beijing China Asia 2012 $8B Sequoia Capital China, SIG Asia Investments, S... April 5
1 SpaceX $100B 2012-12-01 Other Hawthorne United States North America 2002 $7B Founders Fund, Draper Fisher Jurvetson, Rothen... December 10
2 SHEIN $100B 2018-07-03 E-commerce & direct-to-consumer Shenzhen China Asia 2008 $2B Tiger Global Management, Sequoia Capital China... July 10
3 Stripe $95B 2014-01-23 Fintech San Francisco United States North America 2010 $2B Khosla Ventures, LowercaseCapital, capitalG January 4
4 Klarna $46B 2011-12-12 Fintech Stockholm Sweden Europe 2005 $4B Institutional Venture Partners, Sequoia Capita... December 6
In [17]:
# Create 'Years To Join' column by subtracting 'Year Founded' from the year of 'Date Joined'
df['Years To Join'] = df['Date Joined'].dt.year - df['Year Founded']

# Display the first few rows to verify the new column
df.head()





# Display the first few rows of `companies`
# to confirm that the new column did get added.

df.head()
Out[17]:
Company Valuation Date Joined Industry City Country/Region Continent Year Founded Funding Select Investors Month Joined Years To Join
0 Bytedance $180B 2017-04-07 Artificial intelligence Beijing China Asia 2012 $8B Sequoia Capital China, SIG Asia Investments, S... April 5
1 SpaceX $100B 2012-12-01 Other Hawthorne United States North America 2002 $7B Founders Fund, Draper Fisher Jurvetson, Rothen... December 10
2 SHEIN $100B 2018-07-03 E-commerce & direct-to-consumer Shenzhen China Asia 2008 $2B Tiger Global Management, Sequoia Capital China... July 10
3 Stripe $95B 2014-01-23 Fintech San Francisco United States North America 2010 $2B Khosla Ventures, LowercaseCapital, capitalG January 4
4 Klarna $46B 2011-12-12 Fintech Stockholm Sweden Europe 2005 $4B Institutional Venture Partners, Sequoia Capita... December 6
In [20]:
# Count companies per year
year_counts = df['Year Joined'].value_counts().sort_index()

# Plot bar chart
year_counts.plot(kind='bar')
plt.xlabel('Year Joined')
plt.ylabel('Number of Companies')
plt.title('Companies Reaching Unicorn Status by Year')
plt.show()
No description has been provided for this image
In [21]:
# Count companies that joined in 2021
num_2021 = (df['Year Joined'] == 2021).sum()

print(num_2021)
520
In [22]:
# Filter dataset by a year of your interest (in terms of when companies reached unicorn status).
# Save the resulting subset in a new variable. 





# Display the first few rows of the subset to confirm that it was created.


# Convert Date Joined to datetime (if not already done)
df['Date Joined'] = pd.to_datetime(df['Date Joined'])

# Create Year Joined column
df['Year Joined'] = df['Date Joined'].dt.year

# Filter dataset for 2021
companies_2021 = df[df['Year Joined'] == 2021]

# Display the subset
companies_2021.head()
Out[22]:
Company Valuation Date Joined Industry City Country/Region Continent Year Founded Funding Select Investors Month Joined Years To Join Year Joined
12 FTX $32B 2021-07-20 Fintech NaN Bahamas North America 2018 $2B Sequoia Capital, Thoma Bravo, Softbank July 3 2021
16 J&T Express $20B 2021-04-07 Supply chain, logistics, & delivery Jakarta Indonesia Asia 2015 $5B Hillhouse Capital Management, Boyu Capital, Se... April 6 2021
24 Blockchain.com $14B 2021-02-17 Fintech London United Kingdom Europe 2011 $490M Lightspeed Venture Partners, Google Ventures, ... February 10 2021
27 OpenSea $13B 2021-07-20 E-commerce & direct-to-consumer New York United States North America 2017 $427M Andreessen Horowitz, Thirty Five Ventures, Sou... July 4 2021
34 Getir $12B 2021-03-26 E-commerce & direct-to-consumer Istanbul Turkey Europe 2015 $2B Tiger Global Management, Sequoia Capital, Revo... March 6 2021
In [25]:
import matplotlib.pyplot as plt

# Filter companies that joined in 2021
companies_2021 = df[df['Date Joined'].dt.year == 2021]

# Extract month numbers
months_2021 = companies_2021['Date Joined'].dt.month

# Plot histogram
plt.hist(months_2021, bins=12)
plt.xlabel('Month (2021)')
plt.ylabel('Number of Companies')
plt.title('Distribution of Unicorn Companies by Month in 2021')
plt.show()
No description has been provided for this image
In [26]:
import pandas as pd

# Step 1: Take the subset for 2021
companies_2021 = df[df['Date Joined'].dt.year == 2021].copy()

# Add a column for the month name
companies_2021['Month Joined'] = companies_2021['Date Joined'].dt.month_name()

# Optional: if you prefer month numbers for sorting
companies_2021['Month Number'] = companies_2021['Date Joined'].dt.month

# Step 2: Group by the time interval (month) and count companies
monthly_counts = companies_2021.groupby(['Month Number', 'Month Joined'])['Company'].count().reset_index()

# Rename column for clarity
monthly_counts.rename(columns={'Company': 'Number of Companies'}, inplace=True)

# Sort by month number to ensure chronological order
monthly_counts = monthly_counts.sort_values('Month Number')

# Display the result
monthly_counts
Out[26]:
Month Number Month Joined Number of Companies
0 1 January 34
1 2 February 22
2 3 March 52
3 4 April 43
4 5 May 46
5 6 June 55
6 7 July 50
7 8 August 32
8 9 September 52
9 10 October 45
10 11 November 47
11 12 December 42
In [27]:
# Display the first few rows to confirm
monthly_counts.head()
Out[27]:
Month Number Month Joined Number of Companies
0 1 January 34
1 2 February 22
2 3 March 52
3 4 April 43
4 5 May 46
In [28]:
import pandas as pd

# Step 1: Filter for the additional year (2020)
companies_2020 = df[df['Date Joined'].dt.year == 2020].copy()

# Step 2: Concatenate with the previous subset (e.g., companies_2021)
combined_df = pd.concat([companies_2020, companies_2021], ignore_index=True)

# Step 3: Add a column for the month number or month name if not already present
combined_df['Month Number'] = combined_df['Date Joined'].dt.month
combined_df['Month Joined'] = combined_df['Date Joined'].dt.month_name()
combined_df['Year Joined'] = combined_df['Date Joined'].dt.year

# Step 4: Transform 'Valuation' column as needed (ensure numeric, remove $/B if necessary)
# Example: if Valuation is in string like '$1.2B'
combined_df['Valuation Numeric'] = combined_df['Valuation'].replace('[\$,B]', '', regex=True).astype(float)

# Step 5: Group by Year and Month, and compute average valuation
avg_valuation = combined_df.groupby(['Year Joined', 'Month Number', 'Month Joined'])['Valuation Numeric'] \
                           .mean() \
                           .reset_index()

# Optional: sort by year and month
avg_valuation = avg_valuation.sort_values(['Year Joined', 'Month Number'])

# Display the first few rows
avg_valuation.head()
Out[28]:
Year Joined Month Number Month Joined Valuation Numeric
0 2020 1 January 3.200000
1 2020 2 February 2.666667
2 2020 3 March 4.400000
3 2020 4 April 5.166667
4 2020 5 May 2.500000
In [29]:
# Display the first few rows to confirm
avg_valuation.head()
Out[29]:
Year Joined Month Number Month Joined Valuation Numeric
0 2020 1 January 3.200000
1 2020 2 February 2.666667
2 2020 3 March 4.400000
3 2020 4 April 5.166667
4 2020 5 May 2.500000
In [30]:
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Define months in chronological order
months_order = ['January', 'February', 'March', 'April', 'May', 'June', 
                'July', 'August', 'September', 'October', 'November', 'December']

# Step 2: Ensure 'Month Joined' and 'Years To Join' columns exist
# Month Joined
df['Month Joined'] = df['Date Joined'].dt.month_name()
# Years To Join
df['Years To Join'] = df['Date Joined'].dt.year - df['Year Founded']

# Step 3: Create a box plot
plt.figure(figsize=(12,6))
sns.boxplot(data=df, x='Month Joined', y='Years To Join', order=months_order)
plt.xlabel('Month Joined')
plt.ylabel('Years to Become Unicorn')
plt.title('Distribution of Time Taken to Reach Unicorn Status by Month Joined')
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
In [31]:
# Print the months in chronological order
print(months_order)
['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
In [32]:
import matplotlib.pyplot as plt
import seaborn as sns

# Ensure the 'Month Joined' and 'Years To Join' columns exist
df['Month Joined'] = df['Date Joined'].dt.month_name()
df['Years To Join'] = df['Date Joined'].dt.year - df['Year Founded']

# Define months in chronological order
months_order = ['January', 'February', 'March', 'April', 'May', 'June', 
                'July', 'August', 'September', 'October', 'November', 'December']

# Create the box plot
plt.figure(figsize=(12,6))
sns.boxplot(data=df, x='Month Joined', y='Years To Join', order=months_order)
plt.xlabel('Month Joined')
plt.ylabel('Years to Become Unicorn')
plt.title('Distribution of Time Taken to Reach Unicorn Status by Month Joined')
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
In [33]:
import matplotlib.pyplot as plt
import seaborn as sns

# Ensure 'Years To Join' exists
df['Years To Join'] = df['Date Joined'].dt.year - df['Year Founded']

# Group by Year Founded and calculate average Years To Join
avg_years = df.groupby('Year Founded')['Years To Join'].mean().reset_index()

# Create bar plot
plt.figure(figsize=(12,6))
sns.barplot(data=avg_years, x='Year Founded', y='Years To Join', palette='viridis')
plt.xlabel('Year Founded')
plt.ylabel('Average Years to Become Unicorn')
plt.title('Average Time to Reach Unicorn Status by Year Founded')
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
In [34]:
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Filter for the year of interest (e.g., 2021)
companies_2021 = df[df['Date Joined'].dt.year == 2021].copy()

# Step 2: Extract month names
companies_2021['Month Joined'] = companies_2021['Date Joined'].dt.month_name()

# Step 3: Count companies per month
monthly_counts = companies_2021['Month Joined'].value_counts().reindex([
    'January', 'February', 'March', 'April', 'May', 'June', 
    'July', 'August', 'September', 'October', 'November', 'December'
])

# Step 4: Create bar plot
plt.figure(figsize=(12,6))
sns.barplot(x=monthly_counts.index, y=monthly_counts.values, palette='coolwarm')
plt.xlabel('Month Joined (2021)')
plt.ylabel('Number of Companies')
plt.title('Number of Companies Reaching Unicorn Status per Month in 2021')
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
In [35]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Ensure Date Joined is datetime
df['Date Joined'] = pd.to_datetime(df['Date Joined'])

# Step 2: Filter for the years of interest, e.g., 2020 and 2021
years_of_interest = [2020, 2021]
subset_years = df[df['Date Joined'].dt.year.isin(years_of_interest)].copy()

# Step 3: Extract year and quarter
subset_years['Year Joined'] = subset_years['Date Joined'].dt.year
subset_years['Quarter Joined'] = subset_years['Date Joined'].dt.to_period('Q').dt.strftime('%Y-Q%q')

# Step 4: Transform Valuation to numeric if needed
subset_years['Valuation Numeric'] = subset_years['Valuation'].replace('[\$,B]', '', regex=True).astype(float)

# Step 5: Group by Quarter and Year, compute average valuation
avg_valuation_quarter = subset_years.groupby(['Quarter Joined', 'Year Joined'])['Valuation Numeric'].mean().reset_index()

# Step 6: Pivot for grouped bar plot
avg_valuation_pivot = avg_valuation_quarter.pivot(index='Quarter Joined', columns='Year Joined', values='Valuation Numeric')

# Step 7: Plot grouped bar chart
avg_valuation_pivot.plot(kind='bar', figsize=(12,6))
plt.xlabel('Quarter')
plt.ylabel('Average Valuation (Billion $)')
plt.title('Average Valuation by Quarter for 2020 and 2021')
plt.xticks(rotation=45)
plt.legend(title='Year')
plt.show()
No description has been provided for this image
In [ ]: