In [70]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
In [71]:
df = pd.read_csv('Unicorn_Companies.csv')
In [72]:
df.head(10)
Out[72]:
| Company | Valuation | Date Joined | Industry | City | Country/Region | Continent | Year Founded | Funding | Select Investors | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Bytedance | $180B | 4/7/17 | Artificial intelligence | Beijing | China | Asia | 2012 | $8B | Sequoia Capital China, SIG Asia Investments, S... |
| 1 | SpaceX | $100B | 12/1/12 | Other | Hawthorne | United States | North America | 2002 | $7B | Founders Fund, Draper Fisher Jurvetson, Rothen... |
| 2 | SHEIN | $100B | 7/3/18 | E-commerce & direct-to-consumer | Shenzhen | China | Asia | 2008 | $2B | Tiger Global Management, Sequoia Capital China... |
| 3 | Stripe | $95B | 1/23/14 | Fintech | San Francisco | United States | North America | 2010 | $2B | Khosla Ventures, LowercaseCapital, capitalG |
| 4 | Klarna | $46B | 12/12/11 | Fintech | Stockholm | Sweden | Europe | 2005 | $4B | Institutional Venture Partners, Sequoia Capita... |
| 5 | Canva | $40B | 1/8/18 | Internet software & services | Surry Hills | Australia | Oceania | 2012 | $572M | Sequoia Capital China, Blackbird Ventures, Mat... |
| 6 | Checkout.com | $40B | 5/2/19 | Fintech | London | United Kingdom | Europe | 2012 | $2B | Tiger Global Management, Insight Partners, DST... |
| 7 | Instacart | $39B | 12/30/14 | Supply chain, logistics, & delivery | San Francisco | United States | North America | 2012 | $3B | Khosla Ventures, Kleiner Perkins Caufield & By... |
| 8 | JUUL Labs | $38B | 12/20/17 | Consumer & retail | San Francisco | United States | North America | 2015 | $14B | Tiger Global Management |
| 9 | Databricks | $38B | 2/5/19 | Data management & analytics | San Francisco | United States | North America | 2013 | $3B | Andreessen Horowitz, New Enterprise Associates... |
In [73]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1074 entries, 0 to 1073 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Company 1074 non-null object 1 Valuation 1074 non-null object 2 Date Joined 1074 non-null object 3 Industry 1074 non-null object 4 City 1058 non-null object 5 Country/Region 1074 non-null object 6 Continent 1074 non-null object 7 Year Founded 1074 non-null int64 8 Funding 1074 non-null object 9 Select Investors 1073 non-null object dtypes: int64(1), object(9) memory usage: 84.0+ KB
In [74]:
df.describe()
Out[74]:
| Year Founded | |
|---|---|
| count | 1074.000000 |
| mean | 2012.895717 |
| std | 5.698573 |
| min | 1919.000000 |
| 25% | 2011.000000 |
| 50% | 2014.000000 |
| 75% | 2016.000000 |
| max | 2021.000000 |
In [75]:
df.size
Out[75]:
10740
In [76]:
df.shape
Out[76]:
(1074, 10)
In [79]:
df['Date Joined'] = pd.to_datetime(df['Date Joined'])
In [82]:
df['year_joined'] = df['Date Joined'].dt.year
df['month_joined'] = df['Date Joined'].dt.month
df['day_joined'] = df['Date Joined'].dt.day
df.drop(columns=['year', 'month', 'day'], inplace=True, errors='ignore')
In [83]:
df[['Date Joined', 'year_joined', 'month_joined', 'day_joined']].head()
Out[83]:
| Date Joined | year_joined | month_joined | day_joined | |
|---|---|---|---|---|
| 0 | 2017-04-07 | 2017 | 4 | 7 |
| 1 | 2012-12-01 | 2012 | 12 | 1 |
| 2 | 2018-07-03 | 2018 | 7 | 3 |
| 3 | 2014-01-23 | 2014 | 1 | 23 |
| 4 | 2011-12-12 | 2011 | 12 | 12 |
In [85]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1074 entries, 0 to 1073 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Company 1074 non-null object 1 Valuation 1074 non-null object 2 Date Joined 1074 non-null datetime64[ns] 3 Industry 1074 non-null object 4 City 1058 non-null object 5 Country/Region 1074 non-null object 6 Continent 1074 non-null object 7 Year Founded 1074 non-null int64 8 Funding 1074 non-null object 9 Select Investors 1073 non-null object 10 year_joined 1074 non-null int32 11 month_joined 1074 non-null int32 12 day_joined 1074 non-null int32 dtypes: datetime64[ns](1), int32(3), int64(1), object(8) memory usage: 96.6+ KB
In [86]:
companies_sampled = df.sample(n=50, random_state=42)
companies_sampled.head()
Out[86]:
| Company | Valuation | Date Joined | Industry | City | Country/Region | Continent | Year Founded | Funding | Select Investors | year_joined | month_joined | day_joined | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 542 | Aiven | $2B | 2021-10-18 | Internet software & services | Helsinki | Finland | Europe | 2016 | $210M | Institutional Venture Partners, Atomico, Early... | 2021 | 10 | 18 |
| 370 | Jusfoun Big Data | $2B | 2018-07-09 | Data management & analytics | Beijing | China | Asia | 2010 | $137M | Boxin Capital, DT Capital Partners, IDG Capital | 2018 | 7 | 9 |
| 307 | Innovaccer | $3B | 2021-02-19 | Health | San Francisco | United States | North America | 2014 | $379M | M12, WestBridge Capital, Lightspeed Venture Pa... | 2021 | 2 | 19 |
| 493 | Algolia | $2B | 2021-07-28 | Internet software & services | San Francisco | United States | North America | 2012 | $334M | Accel, Alven Capital, Storm Ventures | 2021 | 7 | 28 |
| 350 | SouChe Holdings | $3B | 2017-11-01 | E-commerce & direct-to-consumer | Hangzhou | China | Asia | 2012 | $1B | Morningside Ventures, Warburg Pincus, CreditEa... | 2017 | 11 | 1 |
In [87]:
df.columns
Out[87]:
Index(['Company', 'Valuation', 'Date Joined', 'Industry', 'City',
'Country/Region', 'Continent', 'Year Founded', 'Funding',
'Select Investors', 'year_joined', 'month_joined', 'day_joined'],
dtype='object')
In [92]:
companies_sampled.columns = companies_sampled.columns.str.lower().str.replace(' ', '_')
In [93]:
import matplotlib.pyplot as plt
# Create a new column for time to unicorn
companies_sampled['years_to_unicorn'] = (
companies_sampled['year_joined'] - companies_sampled['year_founded']
)
# Find the longest time to unicorn status for each industry
industry_max = (
companies_sampled.groupby('industry')['years_to_unicorn']
.max()
.sort_values(ascending=False)
)
# Create bar chart
plt.figure()
plt.bar(industry_max.index, industry_max.values)
plt.title('Longest Time to Reach Unicorn Status by Industry')
plt.xlabel('Industry')
plt.ylabel('Years to Unicorn Status')
plt.xticks(rotation=45)
plt.show()
In [94]:
import matplotlib.pyplot as plt
# Ensure 'years_to_unicorn' exists
companies_sampled['years_to_unicorn'] = (
companies_sampled['year_joined'] - companies_sampled['year_founded']
)
# Sort by years_to_unicorn if you want the longest at the front
companies_sampled_sorted = companies_sampled.sort_values('years_to_unicorn', ascending=False).head(20) # optional: top 20
# Plot bar chart
plt.figure(figsize=(12,6))
plt.bar(companies_sampled_sorted['company'], companies_sampled_sorted['years_to_unicorn'], color='skyblue')
plt.title('Time to Reach Unicorn Status for Companies')
plt.xlabel('Company')
plt.ylabel('Years to Unicorn Status')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
In [95]:
import matplotlib.pyplot as plt
import pandas as pd
# Step 1: Convert valuation column to numeric
# Assume the column is named 'valuation' and looks like "$1.2B", "$800M", etc.
def valuation_to_number(val):
"""
Convert valuation strings like '$1.2B', '$800M' to numeric values in dollars.
"""
if pd.isnull(val):
return 0
val = val.replace('$', '').replace(',', '').upper()
if 'B' in val:
return float(val.replace('B','')) * 1e9
elif 'M' in val:
return float(val.replace('M','')) * 1e6
else:
return float(val)
# Create a new numeric column
companies_sampled['valuation_num'] = companies_sampled['valuation'].apply(valuation_to_number)
# Step 2: Find maximum valuation per industry
industry_max_valuation = companies_sampled.groupby('industry')['valuation_num'].max().sort_values(ascending=False)
# Step 3: Plot bar chart
plt.figure(figsize=(12,6))
plt.bar(industry_max_valuation.index, industry_max_valuation.values, color='skyblue')
plt.title('Maximum Unicorn Valuation by Industry')
plt.xlabel('Industry')
plt.ylabel('Maximum Valuation (USD)')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
In [ ]: