import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt

# Load the unicorn-companies dataset from a CSV given by a relative path.
df = pd.read_csv('Unicorn_Companies.csv')

# Preview the first 10 rows.
df.head(10)

# Print column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1074 entries, 0 to 1073
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Company           1074 non-null   object
 1   Valuation         1074 non-null   object
 2   Date Joined       1074 non-null   object
 3   Industry          1074 non-null   object
 4   City              1058 non-null   object
 5   Country/Region    1074 non-null   object
 6   Continent         1074 non-null   object
 7   Year Founded      1074 non-null   int64 
 8   Funding           1074 non-null   object
 9   Select Investors  1073 non-null   object
dtypes: int64(1), object(9)
memory usage: 84.0+ KB

# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

# Total number of cells in the DataFrame (rows x columns).
df.size

10740

# (rows, columns) tuple of the DataFrame.
df.shape

(1074, 10)

# Parse the 'Date Joined' strings into pandas datetimes; no explicit format is
# given, so pandas infers it from the data.
df['Date Joined'] = pd.to_datetime(df['Date Joined'])

# Split the join date into separate year / month / day columns.
df['year_joined'] = df['Date Joined'].dt.year
df['month_joined'] = df['Date Joined'].dt.month
df['day_joined'] = df['Date Joined'].dt.day

# Remove 'year', 'month' and 'day' columns if they are present; errors='ignore'
# makes this a no-op when they are absent.
# NOTE(review): nothing visible here creates these columns - presumably leftovers
# from an earlier version of this cell; confirm before deleting this line.
df.drop(columns=['year', 'month', 'day'], inplace=True, errors='ignore')

# Eyeball the parsed dates next to the derived columns.
df[['Date Joined', 'year_joined', 'month_joined', 'day_joined']].head()

# Re-inspect the schema now that the new columns exist.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1074 entries, 0 to 1073
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Company           1074 non-null   object        
 1   Valuation         1074 non-null   object        
 2   Date Joined       1074 non-null   datetime64[ns]
 3   Industry          1074 non-null   object        
 4   City              1058 non-null   object        
 5   Country/Region    1074 non-null   object        
 6   Continent         1074 non-null   object        
 7   Year Founded      1074 non-null   int64         
 8   Funding           1074 non-null   object        
 9   Select Investors  1073 non-null   object        
 10  year_joined       1074 non-null   int32         
 11  month_joined      1074 non-null   int32         
 12  day_joined        1074 non-null   int32         
dtypes: datetime64[ns](1), int32(3), int64(1), object(8)
memory usage: 96.6+ KB

# Draw a 50-row random sample; the fixed random_state makes the draw reproducible.
companies_sampled = df.sample(n=50, random_state=42)
companies_sampled.head()

# List the column labels of the full DataFrame.
df.columns

Index(['Company', 'Valuation', 'Date Joined', 'Industry', 'City',
       'Country/Region', 'Continent', 'Year Founded', 'Funding',
       'Select Investors', 'year_joined', 'month_joined', 'day_joined'],
      dtype='object')

# Normalize the sample's column labels: lowercase, spaces replaced by underscores
# (e.g. 'Year Founded' -> 'year_founded'). Other characters such as '/' are kept,
# and only the sample is renamed - df keeps its original labels.
companies_sampled.columns = companies_sampled.columns.str.lower().str.replace(' ', '_')

import matplotlib.pyplot as plt

# Years elapsed between founding and joining the unicorn list, per company.
companies_sampled['years_to_unicorn'] = companies_sampled['year_joined'].sub(
    companies_sampled['year_founded']
)

# Per-industry maximum of that duration, ordered from longest to shortest.
industry_max = companies_sampled.groupby(by='industry')['years_to_unicorn'].max()
industry_max = industry_max.sort_values(ascending=False)

# Create bar chart: one bar per industry, tallest (longest time) first.
# A wide figure, right-anchored rotated tick labels and tight_layout() keep the
# long industry names attached to their bars and inside the canvas.
plt.figure(figsize=(12, 6))
plt.bar(industry_max.index, industry_max.values)

plt.title('Longest Time to Reach Unicorn Status by Industry')
plt.xlabel('Industry')
plt.ylabel('Years to Unicorn Status')
plt.xticks(rotation=45, ha='right')

plt.tight_layout()
plt.show()

import matplotlib.pyplot as plt

# Make sure the 'years_to_unicorn' column exists by (re)computing it here.
companies_sampled['years_to_unicorn'] = companies_sampled['year_joined'].sub(
    companies_sampled['year_founded']
)

# Order companies from the longest to the shortest time-to-unicorn and keep
# only the first 20 rows (the cut-off is optional).
companies_sampled_sorted = companies_sampled.sort_values(
    by='years_to_unicorn', ascending=False
).iloc[:20]

# Bar chart: one bar per company in the sorted selection, bar height is the
# number of years it took to reach unicorn status.
plt.figure(figsize=(12, 6))
plt.bar(
    x=companies_sampled_sorted['company'],
    height=companies_sampled_sorted['years_to_unicorn'],
    color='skyblue',
)

# Rotated, right-anchored tick labels so the company names stay legible.
plt.xticks(rotation=45, ha='right')
plt.xlabel('Company')
plt.ylabel('Years to Unicorn Status')
plt.title('Time to Reach Unicorn Status for Companies')

plt.tight_layout()
plt.show()

import matplotlib.pyplot as plt
import pandas as pd

# Step 1: Convert valuation column to numeric
# Assume the column is named 'valuation' and looks like "$1.2B", "$800M", etc.

def valuation_to_number(val):
    """
    Convert a valuation such as '$1.2B', '$800M' or '$950K' to a number of dollars.

    Parameters:
        val: A valuation string (optional '$', optional thousands commas,
            optional K/M/B/T magnitude suffix in either case), an already
            numeric value, or a missing value (None / NaN).

    Returns:
        The valuation as a float, or 0 when the value is missing or blank.

    Raises:
        ValueError: If the text left after stripping '$', ',' and the suffix
            is not a valid number.
    """
    multipliers = {'K': 1e3, 'M': 1e6, 'B': 1e9, 'T': 1e12}

    if pd.isnull(val):
        return 0

    # str() lets already-numeric inputs (int/float) go through the same path
    # instead of failing on .replace().
    text = str(val).replace('$', '').replace(',', '').strip().upper()
    if not text:
        # A blank cell is treated like a missing value.
        return 0

    # Only a trailing letter is a magnitude suffix; anything else is parsed as-is.
    factor = multipliers.get(text[-1])
    if factor is None:
        return float(text)
    return float(text[:-1]) * factor

# Step 1 (cont.): parse every valuation string into a numeric column.
companies_sampled['valuation_num'] = (
    companies_sampled['valuation'].apply(valuation_to_number)
)

# Step 2: largest valuation within each industry, highest first.
industry_max_valuation = (
    companies_sampled
    .groupby(by='industry')['valuation_num']
    .max()
    .sort_values(ascending=False)
)

# Step 3: bar chart with one bar per industry, bar height is that industry's
# maximum valuation.
plt.figure(figsize=(12, 6))
plt.bar(
    x=industry_max_valuation.index,
    height=industry_max_valuation.values,
    color='skyblue',
)

# Rotated, right-anchored tick labels so the industry names stay legible.
plt.xticks(rotation=45, ha='right')
plt.xlabel('Industry')
plt.ylabel('Maximum Valuation (USD)')
plt.title('Maximum Unicorn Valuation by Industry')
plt.tight_layout()
plt.show()

	Company	Valuation	Date Joined	Industry	City	Country/Region	Continent	Year Founded	Funding	Select Investors
0	Bytedance	$180B	4/7/17	Artificial intelligence	Beijing	China	Asia	2012	$8B	Sequoia Capital China, SIG Asia Investments, S...
1	SpaceX	$100B	12/1/12	Other	Hawthorne	United States	North America	2002	$7B	Founders Fund, Draper Fisher Jurvetson, Rothen...
2	SHEIN	$100B	7/3/18	E-commerce & direct-to-consumer	Shenzhen	China	Asia	2008	$2B	Tiger Global Management, Sequoia Capital China...
3	Stripe	$95B	1/23/14	Fintech	San Francisco	United States	North America	2010	$2B	Khosla Ventures, LowercaseCapital, capitalG
4	Klarna	$46B	12/12/11	Fintech	Stockholm	Sweden	Europe	2005	$4B	Institutional Venture Partners, Sequoia Capita...
5	Canva	$40B	1/8/18	Internet software & services	Surry Hills	Australia	Oceania	2012	$572M	Sequoia Capital China, Blackbird Ventures, Mat...
6	Checkout.com	$40B	5/2/19	Fintech	London	United Kingdom	Europe	2012	$2B	Tiger Global Management, Insight Partners, DST...
7	Instacart	$39B	12/30/14	Supply chain, logistics, & delivery	San Francisco	United States	North America	2012	$3B	Khosla Ventures, Kleiner Perkins Caufield & By...
8	JUUL Labs	$38B	12/20/17	Consumer & retail	San Francisco	United States	North America	2015	$14B	Tiger Global Management
9	Databricks	$38B	2/5/19	Data management & analytics	San Francisco	United States	North America	2013	$3B	Andreessen Horowitz, New Enterprise Associates...

	Year Founded
count	1074.000000
mean	2012.895717
std	5.698573
min	1919.000000
25%	2011.000000
50%	2014.000000
75%	2016.000000
max	2021.000000

	Date Joined	year_joined	month_joined	day_joined
0	2017-04-07	2017	4	7
1	2012-12-01	2012	12	1
2	2018-07-03	2018	7	3
3	2014-01-23	2014	1	23
4	2011-12-12	2011	12	12

	Company	Valuation	Date Joined	Industry	City	Country/Region	Continent	Year Founded	Funding	Select Investors	year_joined	month_joined	day_joined
542	Aiven	$2B	2021-10-18	Internet software & services	Helsinki	Finland	Europe	2016	$210M	Institutional Venture Partners, Atomico, Early...	2021	10	18
370	Jusfoun Big Data	$2B	2018-07-09	Data management & analytics	Beijing	China	Asia	2010	$137M	Boxin Capital, DT Capital Partners, IDG Capital	2018	7	9
307	Innovaccer	$3B	2021-02-19	Health	San Francisco	United States	North America	2014	$379M	M12, WestBridge Capital, Lightspeed Venture Pa...	2021	2	19
493	Algolia	$2B	2021-07-28	Internet software & services	San Francisco	United States	North America	2012	$334M	Accel, Alven Capital, Storm Ventures	2021	7	28
350	SouChe Holdings	$3B	2017-11-01	E-commerce & direct-to-consumer	Hangzhou	China	Asia	2012	$1B	Morningside Ventures, Warburg Pincus, CreditEa...	2017	11	1