The Series Data Structure

In [2]:
import pandas as pd
pd.Series?
In [ ]:
animals = ['Tiger', 'Bear', 'Moose']
pd.Series(animals)
In [ ]:
numbers = [1, 2, 3]
pd.Series(numbers)
In [ ]:
animals = ['Tiger', 'Bear', None]
pd.Series(animals)
In [ ]:
numbers = [1, 2, None]
pd.Series(numbers)
In [ ]:
import numpy as np
np.nan == None
In [ ]:
np.nan == np.nan
In [ ]:
np.isnan(np.nan)
In [4]:
sports = {'Archery': 'Bhutan',
          'Golf': 'Scotland',
          'Sumo': 'Japan',
          'Taekwondo': 'South Korea'}
s = pd.Series(sports)
s
Out[4]:
Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object
In [ ]:
s.index
In [ ]:
s = pd.Series(['Tiger', 'Bear', 'Moose'], index=['India', 'America', 'Canada'])
s
In [3]:
sports = {'Archery': 'Bhutan',
          'Golf': 'Scotland',
          'Sumo': 'Japan',
          'Taekwondo': 'South Korea'}
s = pd.Series(sports, index=['Golf', 'Sumo', 'Hockey'])
s
Out[3]:
Golf      Scotland
Sumo         Japan
Hockey         NaN
dtype: object

Querying a Series

In [1]:
sports = {'Archery': 'Bhutan',
          'Golf': 'Scotland',
          'Sumo': 'Japan',
          'Taekwondo': 'South Korea'}
s = pd.Series(sports)
s
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-322d7d6ab47b> in <module>()
      3           'Sumo': 'Japan',
      4           'Taekwondo': 'South Korea'}
----> 5 s = pd.Series(sports)
      6 s

NameError: name 'pd' is not defined
In [ ]:
s.iloc[3]
In [ ]:
s.loc['Golf']
In [ ]:
s[3]
In [ ]:
s['Golf']
In [ ]:
sports = {99: 'Bhutan',
          100: 'Scotland',
          101: 'Japan',
          102: 'South Korea'}
s = pd.Series(sports)
In [ ]:
s[0] #This won't call s.iloc[0] as one might expect, it generates an error instead
In [ ]:
s = pd.Series([100.00, 120.00, 101.00, 3.00])
s
In [ ]:
total = 0
for item in s:
    total+=item
print(total)
In [ ]:
import numpy as np

total = np.sum(s)
print(total)
In [ ]:
#this creates a big series of random numbers
s = pd.Series(np.random.randint(0,1000,10000))
s.head()
In [ ]:
len(s)
In [ ]:
%%timeit -n 100
summary = 0
for item in s:
    summary+=item
In [ ]:
%%timeit -n 100
summary = np.sum(s)
In [ ]:
s+=2 #adds two to each item in s using broadcasting
s.head()
In [ ]:
for label, value in s.iteritems():
    s.set_value(label, value+2)
s.head()
In [ ]:
%%timeit -n 10
s = pd.Series(np.random.randint(0,1000,10000))
for label, value in s.iteritems():
    s.loc[label]= value+2
In [ ]:
%%timeit -n 10
s = pd.Series(np.random.randint(0,1000,10000))
s+=2
In [ ]:
s = pd.Series([1, 2, 3])
s.loc['Animal'] = 'Bears'
s
In [ ]:
original_sports = pd.Series({'Archery': 'Bhutan',
                             'Golf': 'Scotland',
                             'Sumo': 'Japan',
                             'Taekwondo': 'South Korea'})
cricket_loving_countries = pd.Series(['Australia',
                                      'Barbados',
                                      'Pakistan',
                                      'England'], 
                                   index=['Cricket',
                                          'Cricket',
                                          'Cricket',
                                          'Cricket'])
all_countries = original_sports.append(cricket_loving_countries)
In [ ]:
original_sports
In [ ]:
cricket_loving_countries
In [ ]:
all_countries
In [ ]:
all_countries.loc['Cricket']

The DataFrame Data Structure

In [1]:
import pandas as pd
purchase_1 = pd.Series({'Name': 'Chris',
                        'Item Purchased': 'Dog Food',
                        'Cost': 22.50})
purchase_2 = pd.Series({'Name': 'Kevyn',
                        'Item Purchased': 'Kitty Litter',
                        'Cost': 2.50})
purchase_3 = pd.Series({'Name': 'Vinod',
                        'Item Purchased': 'Bird Seed',
                        'Cost': 5.00})
df = pd.DataFrame([purchase_1, purchase_2, purchase_3], index=['Store 1', 'Store 1', 'Store 2'])
df.head()
Out[1]:
Cost Item Purchased Name
Store 1 22.5 Dog Food Chris
Store 1 2.5 Kitty Litter Kevyn
Store 2 5.0 Bird Seed Vinod
In [2]:
df.loc['Store 2']
Out[2]:
Cost                      5
Item Purchased    Bird Seed
Name                  Vinod
Name: Store 2, dtype: object
In [ ]:
type(df.loc['Store 2'])
In [ ]:
df.loc['Store 1']
In [3]:
df.loc['Store 1', 'Cost']
Out[3]:
Store 1    22.5
Store 1     2.5
Name: Cost, dtype: float64
In [ ]:
df.T
In [4]:
df.T.loc['Cost']
Out[4]:
Store 1    22.5
Store 1     2.5
Store 2       5
Name: Cost, dtype: object
In [ ]:
df['Cost']
In [ ]:
df.loc['Store 1']['Cost']
In [5]:
df.loc[:,['Name', 'Cost']]
Out[5]:
Name Cost
Store 1 Chris 22.5
Store 1 Kevyn 2.5
Store 2 Vinod 5.0
In [6]:
df.drop('Store 1')
Out[6]:
Cost Item Purchased Name
Store 2 5.0 Bird Seed Vinod
In [7]:
df
Out[7]:
Cost Item Purchased Name
Store 1 22.5 Dog Food Chris
Store 1 2.5 Kitty Litter Kevyn
Store 2 5.0 Bird Seed Vinod
In [38]:
copy_df = df.copy()
copy_df = copy_df.drop('Store 1')
copy_df
Out[38]:
Cost Item Purchased Name Location
Store 2 5.0 Bird Seed Vinod 6
In [35]:
copy_df.drop?
In [40]:
del copy_df['Name']
copy_df
Out[40]:
Cost Item Purchased Location
Store 2 5.0 Bird Seed 6
In [18]:
df['Location'] = None
df['Location'][2]=6
df
/home/sabodhapati/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
Out[18]:
Cost Item Purchased Name Location
Store 1 22.5 Dog Food Chris None
Store 1 2.5 Kitty Litter Kevyn None
Store 2 5.0 Bird Seed Vinod 6

Dataframe Indexing and Loading

In [ ]:
costs = df['Cost']
costs
In [ ]:
costs+=2
costs
In [ ]:
df
In [ ]:
!cat olympics.csv
In [ ]:
df = pd.read_csv('olympics.csv')
df.head()
In [ ]:
df = pd.read_csv('olympics.csv', index_col = 0, skiprows=1)
df.head()
In [ ]:
df.columns
In [ ]:
for col in df.columns:
    if col[:2]=='01':
        df.rename(columns={col:'Gold' + col[4:]}, inplace=True)
    if col[:2]=='02':
        df.rename(columns={col:'Silver' + col[4:]}, inplace=True)
    if col[:2]=='03':
        df.rename(columns={col:'Bronze' + col[4:]}, inplace=True)
    if col[:1]=='№':
        df.rename(columns={col:'#' + col[1:]}, inplace=True) 

df.head()

Querying a DataFrame

In [ ]:
df['Gold'] > 0
In [ ]:
only_gold = df.where(df['Gold'] > 0)
only_gold.head()
In [ ]:
only_gold['Gold'].count()
In [ ]:
df['Gold'].count()
In [ ]:
only_gold = only_gold.dropna()
only_gold.head()
In [ ]:
only_gold = df[df['Gold'] > 0]
only_gold.head()
In [ ]:
len(df[(df['Gold'] > 0) | (df['Gold.1'] > 0)])
In [ ]:
df[(df['Gold.1'] > 0) & (df['Gold'] == 0)]

Indexing Dataframes

In [ ]:
df.head()
In [ ]:
df['country'] = df.index
df = df.set_index('Gold')
df.head()
In [ ]:
df = df.reset_index()
df.head()
In [41]:
df = pd.read_csv('./Data_Science_with_Python/course1_downloads/census.csv')
df.head()
Out[41]:
SUMLEV REGION DIVISION STATE COUNTY STNAME CTYNAME CENSUS2010POP ESTIMATESBASE2010 POPESTIMATE2010 ... RDOMESTICMIG2011 RDOMESTICMIG2012 RDOMESTICMIG2013 RDOMESTICMIG2014 RDOMESTICMIG2015 RNETMIG2011 RNETMIG2012 RNETMIG2013 RNETMIG2014 RNETMIG2015
0 40 3 6 1 0 Alabama Alabama 4779736 4780127 4785161 ... 0.002295 -0.193196 0.381066 0.582002 -0.467369 1.030015 0.826644 1.383282 1.724718 0.712594
1 50 3 6 1 1 Alabama Autauga County 54571 54571 54660 ... 7.242091 -2.915927 -3.012349 2.265971 -2.530799 7.606016 -2.626146 -2.722002 2.592270 -2.187333
2 50 3 6 1 3 Alabama Baldwin County 182265 182265 183193 ... 14.832960 17.647293 21.845705 19.243287 17.197872 15.844176 18.559627 22.727626 20.317142 18.293499
3 50 3 6 1 5 Alabama Barbour County 27457 27457 27341 ... -4.728132 -2.500690 -7.056824 -3.904217 -10.543299 -4.874741 -2.758113 -7.167664 -3.978583 -10.543299
4 50 3 6 1 7 Alabama Bibb County 22915 22919 22861 ... -5.527043 -5.068871 -6.201001 -0.177537 0.177258 -5.088389 -4.363636 -5.403729 0.754533 1.107861

5 rows × 100 columns

In [43]:
df['SUMLEV'].unique()
Out[43]:
array([40, 50])
In [ ]:
df=df[df['SUMLEV'] == 50]
df.head()
In [44]:
columns_to_keep = ['STNAME',
                   'CTYNAME',
                   'BIRTHS2010',
                   'BIRTHS2011',
                   'BIRTHS2012',
                   'BIRTHS2013',
                   'BIRTHS2014',
                   'BIRTHS2015',
                   'POPESTIMATE2010',
                   'POPESTIMATE2011',
                   'POPESTIMATE2012',
                   'POPESTIMATE2013',
                   'POPESTIMATE2014',
                   'POPESTIMATE2015']
df = df[columns_to_keep]
df.head()
Out[44]:
STNAME CTYNAME BIRTHS2010 BIRTHS2011 BIRTHS2012 BIRTHS2013 BIRTHS2014 BIRTHS2015 POPESTIMATE2010 POPESTIMATE2011 POPESTIMATE2012 POPESTIMATE2013 POPESTIMATE2014 POPESTIMATE2015
0 Alabama Alabama 14226 59689 59062 57938 58334 58305 4785161 4801108 4816089 4830533 4846411 4858979
1 Alabama Autauga County 151 636 615 574 623 600 54660 55253 55175 55038 55290 55347
2 Alabama Baldwin County 517 2187 2092 2160 2186 2240 183193 186659 190396 195126 199713 203709
3 Alabama Barbour County 70 335 300 283 260 269 27341 27226 27159 26973 26815 26489
4 Alabama Bibb County 44 266 245 259 247 253 22861 22733 22642 22512 22549 22583
In [45]:
df = df.set_index(['STNAME', 'CTYNAME'])
df.head()
Out[45]:
BIRTHS2010 BIRTHS2011 BIRTHS2012 BIRTHS2013 BIRTHS2014 BIRTHS2015 POPESTIMATE2010 POPESTIMATE2011 POPESTIMATE2012 POPESTIMATE2013 POPESTIMATE2014 POPESTIMATE2015
STNAME CTYNAME
Alabama Alabama 14226 59689 59062 57938 58334 58305 4785161 4801108 4816089 4830533 4846411 4858979
Autauga County 151 636 615 574 623 600 54660 55253 55175 55038 55290 55347
Baldwin County 517 2187 2092 2160 2186 2240 183193 186659 190396 195126 199713 203709
Barbour County 70 335 300 283 260 269 27341 27226 27159 26973 26815 26489
Bibb County 44 266 245 259 247 253 22861 22733 22642 22512 22549 22583
In [46]:
df.loc['Michigan', 'Washtenaw County']
/home/sabodhapati/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:1: PerformanceWarning: indexing past lexsort depth may impact performance.
  """Entry point for launching an IPython kernel.
Out[46]:
BIRTHS2010 BIRTHS2011 BIRTHS2012 BIRTHS2013 BIRTHS2014 BIRTHS2015 POPESTIMATE2010 POPESTIMATE2011 POPESTIMATE2012 POPESTIMATE2013 POPESTIMATE2014 POPESTIMATE2015
STNAME CTYNAME
Michigan Washtenaw County 977 3826 3780 3662 3683 3709 345563 349048 351213 354289 357029 358880
In [47]:
df.loc[ [('Michigan', 'Washtenaw County'),
         ('Michigan', 'Wayne County')] ]
Out[47]:
BIRTHS2010 BIRTHS2011 BIRTHS2012 BIRTHS2013 BIRTHS2014 BIRTHS2015 POPESTIMATE2010 POPESTIMATE2011 POPESTIMATE2012 POPESTIMATE2013 POPESTIMATE2014 POPESTIMATE2015
STNAME CTYNAME
Michigan Washtenaw County 977 3826 3780 3662 3683 3709 345563 349048 351213 354289 357029 358880
Wayne County 5918 23819 23270 23377 23607 23586 1815199 1801273 1792514 1775713 1766008 1759335

Missing values

In [ ]:
df = pd.read_csv('log.csv')
df
In [ ]:
df.fillna?
In [ ]:
df = df.set_index('time')
df = df.sort_index()
df
In [ ]:
df = df.reset_index()
df = df.set_index(['time', 'user'])
df
In [ ]:
df = df.fillna(method='ffill')
df.head()