The Series Data Structure

import pandas as pd
pd.Series?

# A list of strings becomes a Series with dtype object.
animals = ['Tiger', 'Bear', 'Moose']
pd.Series(animals)

# A list of integers becomes a Series with an integer dtype.
numbers = [1, 2, 3]
pd.Series(numbers)

# A missing entry in a list of strings is kept as None (dtype stays object).
animals = ['Tiger', 'Bear', None]
pd.Series(animals)

# A missing entry in a list of numbers is stored as NaN, which promotes the
# Series to a floating-point dtype.
numbers = [1, 2, None]
pd.Series(numbers)

import numpy as np
# NaN is not the same thing as None, so this comparison is False.
np.nan == None

# NaN does not compare equal to anything, not even to itself.
np.nan == np.nan

# Use np.isnan() to test for NaN instead of ==.
np.isnan(np.nan)

# Building a Series from a dict uses the dict keys as the index labels
# and the dict values as the data.
sports = {'Archery': 'Bhutan',
          'Golf': 'Scotland',
          'Sumo': 'Japan',
          'Taekwondo': 'South Korea'}
s = pd.Series(sports)
s

Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object

# The index labels are available through the .index attribute.
s.index

# Data and index labels can also be supplied separately.
s = pd.Series(['Tiger', 'Bear', 'Moose'], index=['India', 'America', 'Canada'])
s

# When an explicit index is passed together with a dict, only the listed
# labels are kept: 'Archery' and 'Taekwondo' are left out, and 'Hockey',
# which is not a key of the dict, gets NaN.
sports = {'Archery': 'Bhutan',
          'Golf': 'Scotland',
          'Sumo': 'Japan',
          'Taekwondo': 'South Korea'}
s = pd.Series(sports, index=['Golf', 'Sumo', 'Hockey'])
s

Golf      Scotland
Sumo         Japan
Hockey         NaN
dtype: object

Querying a Series

sports = {'Archery': 'Bhutan',
          'Golf': 'Scotland',
          'Sumo': 'Japan',
          'Taekwondo': 'South Korea'}
s = pd.Series(sports)
s

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-322d7d6ab47b> in <module>()
      3           'Sumo': 'Japan',
      4           'Taekwondo': 'South Korea'}
----> 5 s = pd.Series(sports)
      6 s

NameError: name 'pd' is not defined

# Positional lookup: the entry at position 3 (the fourth one).
s.iloc[3]

# Label lookup.
s.loc['Golf']

# Plain [] with an integer on a non-integer index falls back to a positional
# lookup.
# NOTE(review): this fallback is deprecated in recent pandas releases;
# s.iloc[3] is the explicit spelling.
s[3]

# Plain [] with a label behaves like .loc.
s['Golf']

# With integer labels, [] is label-based, so a bare integer is looked up
# among the labels 99-102 rather than used as a position.
sports = {99: 'Bhutan',
          100: 'Scotland',
          101: 'Japan',
          102: 'South Korea'}
s = pd.Series(sports)

# 0 is not one of the labels, so this lookup fails.
s[0] #This won't call s.iloc[0] as one might expect, it generates an error instead

s = pd.Series([100.00, 120.00, 101.00, 3.00])
s

# Sum the values with an explicit Python loop; this is the baseline that the
# vectorised np.sum() call is compared against.
total = 0
for item in s:
    total+=item
print(total)

import numpy as np

total = np.sum(s)
print(total)

#this creates a big series of random numbers
s = pd.Series(np.random.randint(0,1000,10000))
s.head()

len(s)

%%timeit -n 100
summary = 0
for item in s:
    summary+=item

%%timeit -n 100
summary = np.sum(s)

s+=2 #adds two to each item in s using broadcasting
s.head()

# Element-by-element version of `s += 2`, shown for comparison with the
# vectorised form. Series.items() yields (label, value) pairs and .at writes
# a single value by label; they replace Series.iteritems() and
# Series.set_value(), which are no longer available in current pandas.
for label, value in s.items():
    s.at[label] = value + 2
s.head()

%%timeit -n 10
# Slow path for the timing comparison: build 10000 random integers in
# [0, 1000) and add 2 to each one through an individual label-based write.
# Series.items() replaces Series.iteritems(), which is no longer available
# in current pandas.
s = pd.Series(np.random.randint(0,1000,10000))
for label, value in s.items():
    s.loc[label] = value + 2

%%timeit -n 10
s = pd.Series(np.random.randint(0,1000,10000))
s+=2

s = pd.Series([1, 2, 3])
s.loc['Animal'] = 'Bears'
s

# Four uniquely labelled entries.
original_sports = pd.Series({'Archery': 'Bhutan',
                             'Golf': 'Scotland',
                             'Sumo': 'Japan',
                             'Taekwondo': 'South Korea'})
# Index labels do not have to be unique: all four entries share 'Cricket'.
cricket_loving_countries = pd.Series(['Australia',
                                      'Barbados',
                                      'Pakistan',
                                      'England'],
                                     index=['Cricket',
                                            'Cricket',
                                            'Cricket',
                                            'Cricket'])
# pd.concat() returns a new Series and leaves both inputs unchanged. It
# replaces Series.append(), which was removed in pandas 2.0.
all_countries = pd.concat([original_sports, cricket_loving_countries])

original_sports

cricket_loving_countries

all_countries

all_countries.loc['Cricket']

The DataFrame Data Structure

import pandas as pd
# One record per purchase; each dict becomes a Series indexed by field name.
purchase_1, purchase_2, purchase_3 = (
    pd.Series(record)
    for record in (
        {'Name': 'Chris', 'Item Purchased': 'Dog Food', 'Cost': 22.50},
        {'Name': 'Kevyn', 'Item Purchased': 'Kitty Litter', 'Cost': 2.50},
        {'Name': 'Vinod', 'Item Purchased': 'Bird Seed', 'Cost': 5.00},
    )
)
# Each Series is one row of the frame; the row labels are store names, and
# 'Store 1' is used twice on purpose.
df = pd.DataFrame(
    [purchase_1, purchase_2, purchase_3],
    index=['Store 1', 'Store 1', 'Store 2'],
)
df.head()

df.loc['Store 2']

Cost                      5
Item Purchased    Bird Seed
Name                  Vinod
Name: Store 2, dtype: object

# A label that matches exactly one row returns that row as a Series.
type(df.loc['Store 2'])

# A label shared by several rows returns all of them.
df.loc['Store 1']

# Row label plus column name: the 'Cost' values of the matching rows.
df.loc['Store 1', 'Cost']

Store 1    22.5
Store 1     2.5
Name: Cost, dtype: float64

df.T

df.T.loc['Cost']

Store 1    22.5
Store 1     2.5
Store 2       5
Name: Cost, dtype: object

# [] on a DataFrame selects a column.
df['Cost']

# Chained indexing: select the rows first, then a column from that result.
df.loc['Store 1']['Cost']

# A single .loc call: every row (:) and a list of columns.
df.loc[:,['Name', 'Cost']]

# drop() returns a new DataFrame without the 'Store 1' rows ...
df.drop('Store 1')

# ... so df itself still has all of its rows.
df

# Work on an explicit copy and rebind the name to keep the result of drop().
copy_df = df.copy()
copy_df = copy_df.drop('Store 1')
copy_df

copy_df.drop?

# del removes the column from copy_df in place; nothing is returned.
del copy_df['Name']
copy_df

# Add a new column filled with None, then set the value in the third row.
# The write goes through one .iloc call on df itself. The chained form
# df['Location'][2] = 6 assigns into an intermediate object, which triggers
# SettingWithCopyWarning and may leave df unchanged.
df['Location'] = None
df.iloc[2, df.columns.get_loc('Location')] = 6
df

/home/sabodhapati/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

Dataframe Indexing and Loading

# Selecting a column returns a Series taken from df.
costs = df['Cost']
costs

# In-place arithmetic on that Series.
# NOTE(review): whether this also changes df depends on the pandas version
# (view vs. copy semantics) - take df['Cost'].copy() if df must stay untouched.
costs+=2
costs

# Inspect df to see whether its 'Cost' column was affected.
df

!cat olympics.csv

# Read the file with the default settings.
df = pd.read_csv('olympics.csv')
df.head()

# Read it again, using the first column as the index and skipping the first
# line of the file.
df = pd.read_csv('olympics.csv', index_col = 0, skiprows=1)
df.head()

# The column labels that the next step renames.
df.columns

# Give the columns readable names. A column whose first two characters are
# '01', '02' or '03' becomes 'Gold', 'Silver' or 'Bronze' followed by
# whatever comes after its first four characters; a leading '№' is replaced
# by '#'. The new names are collected first and applied with one rename()
# call, instead of renaming the whole frame once per matching column.
medal_by_prefix = {'01': 'Gold', '02': 'Silver', '03': 'Bronze'}
new_column_names = {}
for col in df.columns:
    medal = medal_by_prefix.get(col[:2])
    if medal is not None:
        new_column_names[col] = medal + col[4:]
    elif col[:1] == '№':
        new_column_names[col] = '#' + col[1:]
# inplace=True keeps df the same object, as the original loop did.
df.rename(columns=new_column_names, inplace=True)

df.head()

Querying a DataFrame

# Boolean mask: True for the rows whose 'Gold' value is greater than zero.
df['Gold'] > 0

# where() keeps the shape of df; rows that fail the mask are filled with NaN.
only_gold = df.where(df['Gold'] > 0)
only_gold.head()

# count() ignores NaN, so this counts only the rows that passed the mask ...
only_gold['Gold'].count()

# ... compared with the number of non-missing values in the original column.
df['Gold'].count()

# Drop every row that contains a NaN, which removes the rows masked out by
# where().
only_gold = only_gold.dropna()
only_gold.head()

# Indexing with the mask directly filters the rows in one step.
only_gold = df[df['Gold'] > 0]
only_gold.head()

# Masks combine with | (or) and & (and); each comparison is wrapped in its
# own parentheses.
len(df[(df['Gold'] > 0) | (df['Gold.1'] > 0)])

df[(df['Gold.1'] > 0) & (df['Gold'] == 0)]

Indexing Dataframes

df.head()

# Copy the current index into a regular column first, because set_index()
# replaces the index and the old labels would otherwise be lost.
df['country'] = df.index
df = df.set_index('Gold')
df.head()

# reset_index() moves the index back into a column and installs a default
# integer index.
df = df.reset_index()
df.head()

df = pd.read_csv('./Data_Science_with_Python/course1_downloads/census.csv')
df.head()

df['SUMLEV'].unique()

array([40, 50])

# Keep only the rows whose SUMLEV is 50 (the column holds just 40 and 50).
# NOTE(review): presumably 50 marks county-level rows - confirm against the
# census file documentation.
df=df[df['SUMLEV'] == 50]
df.head()

# Reduce the frame to the name columns plus the yearly birth and population
# estimate columns.
columns_to_keep = ['STNAME',
                   'CTYNAME',
                   'BIRTHS2010',
                   'BIRTHS2011',
                   'BIRTHS2012',
                   'BIRTHS2013',
                   'BIRTHS2014',
                   'BIRTHS2015',
                   'POPESTIMATE2010',
                   'POPESTIMATE2011',
                   'POPESTIMATE2012',
                   'POPESTIMATE2013',
                   'POPESTIMATE2014',
                   'POPESTIMATE2015']
df = df[columns_to_keep]
df.head()

# Use the two name columns together as a two-level index.
df = df.set_index(['STNAME', 'CTYNAME'])
df.head()

# Look up a row by giving one label per index level.
# NOTE(review): the recorded run emitted a PerformanceWarning about lexsort
# depth for this lookup; calling df.sort_index() after set_index() is the
# usual remedy - confirm before changing the cell.
df.loc['Michigan', 'Washtenaw County']

/home/sabodhapati/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:1: PerformanceWarning: indexing past lexsort depth may impact performance.
  """Entry point for launching an IPython kernel.

df.loc[ [('Michigan', 'Washtenaw County'),
         ('Michigan', 'Wayne County')] ]

Missing values

df = pd.read_csv('log.csv')
df

df.fillna?

# Index the log by its 'time' column and sort the rows by that index.
df = df.set_index('time')
df = df.sort_index()
df

# Move 'time' back into a column, then index by 'time' and 'user' together.
df = df.reset_index()
df = df.set_index(['time', 'user'])
df

# Forward-fill: each missing value takes the last valid value above it in
# the same column. DataFrame.ffill() replaces fillna(method='ffill'), whose
# `method` argument is deprecated in current pandas.
df = df.ffill()
df.head()

	SUMLEV	REGION	DIVISION	STATE	COUNTY	STNAME	CTYNAME	CENSUS2010POP	ESTIMATESBASE2010	POPESTIMATE2010	...	RDOMESTICMIG2011	RDOMESTICMIG2012	RDOMESTICMIG2013	RDOMESTICMIG2014	RDOMESTICMIG2015	RNETMIG2011	RNETMIG2012	RNETMIG2013	RNETMIG2014	RNETMIG2015
0	40	3	6	1	0	Alabama	Alabama	4779736	4780127	4785161	...	0.002295	-0.193196	0.381066	0.582002	-0.467369	1.030015	0.826644	1.383282	1.724718	0.712594
1	50	3	6	1	1	Alabama	Autauga County	54571	54571	54660	...	7.242091	-2.915927	-3.012349	2.265971	-2.530799	7.606016	-2.626146	-2.722002	2.592270	-2.187333
2	50	3	6	1	3	Alabama	Baldwin County	182265	182265	183193	...	14.832960	17.647293	21.845705	19.243287	17.197872	15.844176	18.559627	22.727626	20.317142	18.293499
3	50	3	6	1	5	Alabama	Barbour County	27457	27457	27341	...	-4.728132	-2.500690	-7.056824	-3.904217	-10.543299	-4.874741	-2.758113	-7.167664	-3.978583	-10.543299
4	50	3	6	1	7	Alabama	Bibb County	22915	22919	22861	...	-5.527043	-5.068871	-6.201001	-0.177537	0.177258	-5.088389	-4.363636	-5.403729	0.754533	1.107861

	STNAME	CTYNAME	BIRTHS2010	BIRTHS2011	BIRTHS2012	BIRTHS2013	BIRTHS2014	BIRTHS2015	POPESTIMATE2010	POPESTIMATE2011	POPESTIMATE2012	POPESTIMATE2013	POPESTIMATE2014	POPESTIMATE2015
0	Alabama	Alabama	14226	59689	59062	57938	58334	58305	4785161	4801108	4816089	4830533	4846411	4858979
1	Alabama	Autauga County	151	636	615	574	623	600	54660	55253	55175	55038	55290	55347
2	Alabama	Baldwin County	517	2187	2092	2160	2186	2240	183193	186659	190396	195126	199713	203709
3	Alabama	Barbour County	70	335	300	283	260	269	27341	27226	27159	26973	26815	26489
4	Alabama	Bibb County	44	266	245	259	247	253	22861	22733	22642	22512	22549	22583

		BIRTHS2010	BIRTHS2011	BIRTHS2012	BIRTHS2013	BIRTHS2014	BIRTHS2015	POPESTIMATE2010	POPESTIMATE2011	POPESTIMATE2012	POPESTIMATE2013	POPESTIMATE2014	POPESTIMATE2015
STNAME	CTYNAME
Alabama	Alabama	14226	59689	59062	57938	58334	58305	4785161	4801108	4816089	4830533	4846411	4858979
	Autauga County	151	636	615	574	623	600	54660	55253	55175	55038	55290	55347
	Baldwin County	517	2187	2092	2160	2186	2240	183193	186659	190396	195126	199713	203709
	Barbour County	70	335	300	283	260	269	27341	27226	27159	26973	26815	26489
	Bibb County	44	266	245	259	247	253	22861	22733	22642	22512	22549	22583

		BIRTHS2010	BIRTHS2011	BIRTHS2012	BIRTHS2013	BIRTHS2014	BIRTHS2015	POPESTIMATE2010	POPESTIMATE2011	POPESTIMATE2012	POPESTIMATE2013	POPESTIMATE2014	POPESTIMATE2015
STNAME	CTYNAME
Michigan	Washtenaw County	977	3826	3780	3662	3683	3709	345563	349048	351213	354289	357029	358880
Michigan	Wayne County	5918	23819	23270	23377	23607	23586	1815199	1801273	1792514	1775713	1766008	1759335

	Cost	Item Purchased	Name
Store 1	22.5	Dog Food	Chris
Store 1	2.5	Kitty Litter	Kevyn
Store 2	5.0	Bird Seed	Vinod