In [70]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
In [71]:
df = pd.read_csv('Unicorn_Companies.csv')
In [72]:
df.head(10)
Out[72]:
Company Valuation Date Joined Industry City Country/Region Continent Year Founded Funding Select Investors
0 Bytedance $180B 4/7/17 Artificial intelligence Beijing China Asia 2012 $8B Sequoia Capital China, SIG Asia Investments, S...
1 SpaceX $100B 12/1/12 Other Hawthorne United States North America 2002 $7B Founders Fund, Draper Fisher Jurvetson, Rothen...
2 SHEIN $100B 7/3/18 E-commerce & direct-to-consumer Shenzhen China Asia 2008 $2B Tiger Global Management, Sequoia Capital China...
3 Stripe $95B 1/23/14 Fintech San Francisco United States North America 2010 $2B Khosla Ventures, LowercaseCapital, capitalG
4 Klarna $46B 12/12/11 Fintech Stockholm Sweden Europe 2005 $4B Institutional Venture Partners, Sequoia Capita...
5 Canva $40B 1/8/18 Internet software & services Surry Hills Australia Oceania 2012 $572M Sequoia Capital China, Blackbird Ventures, Mat...
6 Checkout.com $40B 5/2/19 Fintech London United Kingdom Europe 2012 $2B Tiger Global Management, Insight Partners, DST...
7 Instacart $39B 12/30/14 Supply chain, logistics, & delivery San Francisco United States North America 2012 $3B Khosla Ventures, Kleiner Perkins Caufield & By...
8 JUUL Labs $38B 12/20/17 Consumer & retail San Francisco United States North America 2015 $14B Tiger Global Management
9 Databricks $38B 2/5/19 Data management & analytics San Francisco United States North America 2013 $3B Andreessen Horowitz, New Enterprise Associates...
In [73]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1074 entries, 0 to 1073
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Company           1074 non-null   object
 1   Valuation         1074 non-null   object
 2   Date Joined       1074 non-null   object
 3   Industry          1074 non-null   object
 4   City              1058 non-null   object
 5   Country/Region    1074 non-null   object
 6   Continent         1074 non-null   object
 7   Year Founded      1074 non-null   int64 
 8   Funding           1074 non-null   object
 9   Select Investors  1073 non-null   object
dtypes: int64(1), object(9)
memory usage: 84.0+ KB
In [74]:
df.describe()
Out[74]:
Year Founded
count 1074.000000
mean 2012.895717
std 5.698573
min 1919.000000
25% 2011.000000
50% 2014.000000
75% 2016.000000
max 2021.000000
In [75]:
df.size
Out[75]:
10740
In [76]:
df.shape
Out[76]:
(1074, 10)
In [79]:
df['Date Joined'] = pd.to_datetime(df['Date Joined'])
In [82]:
df['year_joined'] = df['Date Joined'].dt.year
df['month_joined'] = df['Date Joined'].dt.month
df['day_joined'] = df['Date Joined'].dt.day

df.drop(columns=['year', 'month', 'day'], inplace=True, errors='ignore')
In [83]:
df[['Date Joined', 'year_joined', 'month_joined', 'day_joined']].head()
Out[83]:
Date Joined year_joined month_joined day_joined
0 2017-04-07 2017 4 7
1 2012-12-01 2012 12 1
2 2018-07-03 2018 7 3
3 2014-01-23 2014 1 23
4 2011-12-12 2011 12 12
In [85]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1074 entries, 0 to 1073
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Company           1074 non-null   object        
 1   Valuation         1074 non-null   object        
 2   Date Joined       1074 non-null   datetime64[ns]
 3   Industry          1074 non-null   object        
 4   City              1058 non-null   object        
 5   Country/Region    1074 non-null   object        
 6   Continent         1074 non-null   object        
 7   Year Founded      1074 non-null   int64         
 8   Funding           1074 non-null   object        
 9   Select Investors  1073 non-null   object        
 10  year_joined       1074 non-null   int32         
 11  month_joined      1074 non-null   int32         
 12  day_joined        1074 non-null   int32         
dtypes: datetime64[ns](1), int32(3), int64(1), object(8)
memory usage: 96.6+ KB
In [86]:
companies_sampled = df.sample(n=50, random_state=42)
companies_sampled.head()
Out[86]:
Company Valuation Date Joined Industry City Country/Region Continent Year Founded Funding Select Investors year_joined month_joined day_joined
542 Aiven $2B 2021-10-18 Internet software & services Helsinki Finland Europe 2016 $210M Institutional Venture Partners, Atomico, Early... 2021 10 18
370 Jusfoun Big Data $2B 2018-07-09 Data management & analytics Beijing China Asia 2010 $137M Boxin Capital, DT Capital Partners, IDG Capital 2018 7 9
307 Innovaccer $3B 2021-02-19 Health San Francisco United States North America 2014 $379M M12, WestBridge Capital, Lightspeed Venture Pa... 2021 2 19
493 Algolia $2B 2021-07-28 Internet software & services San Francisco United States North America 2012 $334M Accel, Alven Capital, Storm Ventures 2021 7 28
350 SouChe Holdings $3B 2017-11-01 E-commerce & direct-to-consumer Hangzhou China Asia 2012 $1B Morningside Ventures, Warburg Pincus, CreditEa... 2017 11 1
In [87]:
df.columns
Out[87]:
Index(['Company', 'Valuation', 'Date Joined', 'Industry', 'City',
       'Country/Region', 'Continent', 'Year Founded', 'Funding',
       'Select Investors', 'year_joined', 'month_joined', 'day_joined'],
      dtype='object')
In [92]:
companies_sampled.columns = companies_sampled.columns.str.lower().str.replace(' ', '_')
In [93]:
import matplotlib.pyplot as plt

# Create a new column for time to unicorn
companies_sampled['years_to_unicorn'] = (
    companies_sampled['year_joined'] - companies_sampled['year_founded']
)

# Find the longest time to unicorn status for each industry
industry_max = (
    companies_sampled.groupby('industry')['years_to_unicorn']
    .max()
    .sort_values(ascending=False)
)

# Create bar chart
plt.figure()
plt.bar(industry_max.index, industry_max.values)

plt.title('Longest Time to Reach Unicorn Status by Industry')
plt.xlabel('Industry')
plt.ylabel('Years to Unicorn Status')
plt.xticks(rotation=45)

plt.show()
No description has been provided for this image
In [94]:
import matplotlib.pyplot as plt

# Ensure 'years_to_unicorn' exists
companies_sampled['years_to_unicorn'] = (
    companies_sampled['year_joined'] - companies_sampled['year_founded']
)

# Sort by years_to_unicorn if you want the longest at the front
companies_sampled_sorted = companies_sampled.sort_values('years_to_unicorn', ascending=False).head(20)  # optional: top 20

# Plot bar chart
plt.figure(figsize=(12,6))
plt.bar(companies_sampled_sorted['company'], companies_sampled_sorted['years_to_unicorn'], color='skyblue')

plt.title('Time to Reach Unicorn Status for Companies')
plt.xlabel('Company')
plt.ylabel('Years to Unicorn Status')
plt.xticks(rotation=45, ha='right')

plt.tight_layout()
plt.show()
No description has been provided for this image
In [95]:
import matplotlib.pyplot as plt
import pandas as pd

# Step 1: Convert valuation column to numeric
# Assume the column is named 'valuation' and looks like "$1.2B", "$800M", etc.

def valuation_to_number(val):
    """
    Convert valuation strings like '$1.2B', '$800M' to numeric values in dollars.
    """
    if pd.isnull(val):
        return 0
    val = val.replace('$', '').replace(',', '').upper()
    if 'B' in val:
        return float(val.replace('B','')) * 1e9
    elif 'M' in val:
        return float(val.replace('M','')) * 1e6
    else:
        return float(val)

# Create a new numeric column
companies_sampled['valuation_num'] = companies_sampled['valuation'].apply(valuation_to_number)

# Step 2: Find maximum valuation per industry
industry_max_valuation = companies_sampled.groupby('industry')['valuation_num'].max().sort_values(ascending=False)

# Step 3: Plot bar chart
plt.figure(figsize=(12,6))
plt.bar(industry_max_valuation.index, industry_max_valuation.values, color='skyblue')

plt.title('Maximum Unicorn Valuation by Industry')
plt.xlabel('Industry')
plt.ylabel('Maximum Valuation (USD)')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]: