Cleaning and Preprocessing#

import pandas as pd

Explore#

drinks = pd.read_csv("data/drinks.csv")
drinks.head()
country beer_servings spirit_servings wine_servings total_litres_of_pure_alcohol continent
0 Afghanistan 0 0 0 0.0 Asia
1 Albania 89 132 54 4.9 Europe
2 Algeria 25 0 14 0.7 Africa
3 Andorra 245 138 312 12.4 Europe
4 Angola 217 57 45 5.9 Africa

Max & Min#

drinks["beer_servings"].nlargest(10)
117    376
45     361
62     347
65     346
98     343
135    343
188    333
81     313
129    306
140    297
Name: beer_servings, dtype: int64
drinks["beer_servings"].idxmax()
117
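The opposite lookups work the same way; a quick sketch using nsmallest and idxmin on the same column:

# the ten smallest values
drinks["beer_servings"].nsmallest(10)
# row label of the minimum value
drinks["beer_servings"].idxmin()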

Count#

drinks["continent"].value_counts()
continent
Africa           53
Europe           45
Asia             44
North America    23
Oceania          16
South America    12
Name: count, dtype: int64
drinks["continent"].value_counts(normalize=True)
continent
Africa           0.274611
Europe           0.233161
Asia             0.227979
North America    0.119171
Oceania          0.082902
South America    0.062176
Name: proportion, dtype: float64
# access the Series index
drinks["continent"].value_counts().index
Index(['Africa', 'Europe', 'Asia', 'North America', 'Oceania',
       'South America'],
      dtype='object', name='continent')
# access the Series values
drinks["continent"].value_counts().array
<NumpyExtensionArray>
[np.int64(53), np.int64(45), np.int64(44), np.int64(23), np.int64(16),
 np.int64(12)]
Length: 6, dtype: int64
# any Series can be sorted by its values
drinks.continent.value_counts().sort_values()
continent
South America    12
Oceania          16
North America    23
Asia             44
Europe           45
Africa           53
Name: count, dtype: int64
# any Series can also be sorted by its index
drinks["continent"].value_counts().sort_index()
continent
Africa           53
Asia             44
Europe           45
North America    23
Oceania          16
South America    12
Name: count, dtype: int64
drinks["continent"].unique()
array(['Asia', 'Europe', 'Africa', 'North America', 'South America',
       'Oceania'], dtype=object)
drinks["continent"].nunique()
6

Sort#

movies = pd.read_csv("data/imdb_1000.csv")
movies.head()
star_rating title content_rating genre duration actors_list
0 9.3 The Shawshank Redemption R Crime 142 [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt...
1 9.2 The Godfather R Crime 175 [u'Marlon Brando', u'Al Pacino', u'James Caan']
2 9.1 The Godfather: Part II R Crime 200 [u'Al Pacino', u'Robert De Niro', u'Robert Duv...
3 9.0 The Dark Knight PG-13 Action 152 [u'Christian Bale', u'Heath Ledger', u'Aaron E...
4 8.9 Pulp Fiction R Crime 154 [u'John Travolta', u'Uma Thurman', u'Samuel L....
# sort the 'title' Series in ascending order (returns a Series)
movies.title.sort_values().head()
542     (500) Days of Summer
5               12 Angry Men
201         12 Years a Slave
698                127 Hours
110    2001: A Space Odyssey
Name: title, dtype: object
# sort in descending order instead
movies.title.sort_values(ascending=False).head()
864               [Rec]
526                Zulu
615          Zombieland
677              Zodiac
955    Zero Dark Thirty
Name: title, dtype: object
# sort the entire DataFrame by the 'title' Series (returns a DataFrame)
movies.sort_values("title").head()
star_rating title content_rating genre duration actors_list
542 7.8 (500) Days of Summer PG-13 Comedy 95 [u'Zooey Deschanel', u'Joseph Gordon-Levitt', ...
5 8.9 12 Angry Men NOT RATED Drama 96 [u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals...
201 8.1 12 Years a Slave R Biography 134 [u'Chiwetel Ejiofor', u'Michael Kenneth Willia...
698 7.6 127 Hours R Adventure 94 [u'James Franco', u'Amber Tamblyn', u'Kate Mara']
110 8.3 2001: A Space Odyssey G Mystery 160 [u'Keir Dullea', u'Gary Lockwood', u'William S...
# sort in descending order instead
movies.sort_values("title", ascending=False).head()
star_rating title content_rating genre duration actors_list
864 7.5 [Rec] R Horror 78 [u'Manuela Velasco', u'Ferran Terraza', u'Jorg...
526 7.8 Zulu UNRATED Drama 138 [u'Stanley Baker', u'Jack Hawkins', u'Ulla Jac...
615 7.7 Zombieland R Comedy 88 [u'Jesse Eisenberg', u'Emma Stone', u'Woody Ha...
677 7.7 Zodiac R Crime 157 [u'Jake Gyllenhaal', u'Robert Downey Jr.', u'M...
955 7.4 Zero Dark Thirty R Drama 157 [u'Jessica Chastain', u'Joel Edgerton', u'Chri...
# sort the DataFrame first by 'content_rating', then by 'duration'
movies.sort_values(["content_rating", "duration"]).head()
star_rating title content_rating genre duration actors_list
713 7.6 The Jungle Book APPROVED Animation 78 [u'Phil Harris', u'Sebastian Cabot', u'Louis P...
513 7.8 Invasion of the Body Snatchers APPROVED Horror 80 [u'Kevin McCarthy', u'Dana Wynter', u'Larry Ga...
272 8.1 The Killing APPROVED Crime 85 [u'Sterling Hayden', u'Coleen Gray', u'Vince E...
703 7.6 Dracula APPROVED Horror 85 [u'Bela Lugosi', u'Helen Chandler', u'David Ma...
612 7.7 A Hard Day's Night APPROVED Comedy 87 [u'John Lennon', u'Paul McCartney', u'George H...
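The 'ascending' parameter also accepts one flag per sort key, so each column can be sorted in its own direction; a minimal sketch:

# sort 'content_rating' ascending, but 'duration' descending within each rating
movies.sort_values(["content_rating", "duration"], ascending=[True, False]).head()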

Cross-tabulation#

pd.crosstab(drinks["country"], drinks["continent"])
continent Africa Asia Europe North America Oceania South America
country
Afghanistan 0 1 0 0 0 0
Albania 0 0 1 0 0 0
Algeria 1 0 0 0 0 0
Andorra 0 0 1 0 0 0
Angola 1 0 0 0 0 0
... ... ... ... ... ... ...
Venezuela 0 0 0 0 0 1
Vietnam 0 1 0 0 0 0
Yemen 0 1 0 0 0 0
Zambia 1 0 0 0 0 0
Zimbabwe 1 0 0 0 0 0

193 rows × 6 columns
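crosstab can also append row and column totals; a small sketch using the 'margins' parameter:

# add an 'All' row and column containing the totals
pd.crosstab(drinks["country"], drinks["continent"], margins=True)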

Transform#

Group#

drinks = pd.read_csv("data/drinks.csv")
drinks.head()
country beer_servings spirit_servings wine_servings total_litres_of_pure_alcohol continent
0 Afghanistan 0 0 0 0.0 Asia
1 Albania 89 132 54 4.9 Europe
2 Algeria 25 0 14 0.7 Africa
3 Andorra 245 138 312 12.4 Europe
4 Angola 217 57 45 5.9 Africa
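# select the row at position 5 (the sixth row) within each continent group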
drinks.groupby("continent").nth(5)
country beer_servings spirit_servings wine_servings total_litres_of_pure_alcohol continent
15 Belarus 142 373 42 14.4 Europe
27 Burundi 88 0 0 6.3 Africa
30 Cambodia 57 65 1 2.2 Asia
41 Costa Rica 149 87 11 4.4 North America
52 Ecuador 162 74 3 4.2 South America
110 Micronesia 62 50 18 2.3 Oceania
# calculate the mean beer servings for each continent
drinks.groupby("continent")["beer_servings"].mean()
continent
Africa            61.471698
Asia              37.045455
Europe           193.777778
North America    145.434783
Oceania           89.687500
South America    175.083333
Name: beer_servings, dtype: float64
# other aggregation functions (such as 'max') can also be used with groupby
drinks.groupby("continent")["beer_servings"].max()
continent
Africa           376
Asia             247
Europe           361
North America    285
Oceania          306
South America    333
Name: beer_servings, dtype: int64
# multiple aggregation functions can be applied simultaneously
drinks.groupby("continent")["beer_servings"].agg(["count", "mean", "min", "max"])
count mean min max
continent
Africa 53 61.471698 0 376
Asia 44 37.045455 0 247
Europe 45 193.777778 0 361
North America 23 145.434783 1 285
Oceania 16 89.687500 0 306
South America 12 175.083333 93 333

Apply#

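# broadcast each group's mean back onto the rows of that group with 'transform'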
drinks["average_wine_servings"] = drinks.groupby("continent")[
    "wine_servings"
].transform("mean")
drinks.head()
country beer_servings spirit_servings wine_servings total_litres_of_pure_alcohol continent average_wine_servings
0 Afghanistan 0 0 0 0.0 Asia 9.068182
1 Albania 89 132 54 4.9 Europe 142.222222
2 Algeria 25 0 14 0.7 Africa 16.264151
3 Andorra 245 138 312 12.4 Europe 142.222222
4 Angola 217 57 45 5.9 Africa 16.264151
drinks["average_wine_servings"] = drinks["average_wine_servings"].apply(
    lambda x: x / 100
)
drinks.head()
country beer_servings spirit_servings wine_servings total_litres_of_pure_alcohol continent average_wine_servings
0 Afghanistan 0 0 0 0.0 Asia 0.090682
1 Albania 89 132 54 4.9 Europe 1.422222
2 Algeria 25 0 14 0.7 Africa 0.162642
3 Andorra 245 138 312 12.4 Europe 1.422222
4 Angola 217 57 45 5.9 Africa 0.162642
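The scalar lambda above works, but plain vectorized arithmetic does the same thing more simply and faster; an equivalent sketch:

# equivalent to the 'apply' call above, without the Python-level loop
drinks["average_wine_servings"] = drinks["average_wine_servings"] / 100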
iris = pd.read_csv("data/iris.csv")
iris.head()
sepal_length sepal_width petal_length petal_width species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
iris.groupby("species").size()
species
setosa        50
versicolor    50
virginica     50
dtype: int64
iris.groupby("species").idxmax()
sepal_length sepal_width petal_length petal_width
species
setosa 14 15 24 43
versicolor 50 85 83 70
virginica 131 117 118 100
iris.groupby("species").agg(
    sepal_min=("sepal_length", "min"),
    sepal_max=("sepal_length", "max"),
    petal_mean=("petal_length", "mean"),
    petal_std=("petal_length", "std"),
)
sepal_min sepal_max petal_mean petal_std
species
setosa 4.3 5.8 1.462 0.173664
versicolor 4.9 7.0 4.260 0.469911
virginica 4.9 7.9 5.552 0.551895
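For a quick overview without naming each statistic, describe can also be applied per group:

# summary statistics of 'petal_length' for each species
iris.groupby("species")["petal_length"].describe()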

Apply a function#

train = pd.read_csv("data/titanic_train.csv")
train.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
# map 'female' to 0 and 'male' to 1
train["Sex_num"] = train.Sex.map({"female": 0, "male": 1})
train.loc[0:4, ["Sex", "Sex_num"]]
Sex Sex_num
0 male 1
1 female 0
2 female 0
3 female 0
4 male 1
# calculate the length of each string in the 'Name' Series
train["Name_length"] = train["Name"].apply(len)
train.loc[0:4, ["Name", "Name_length"]]
Name Name_length
0 Braund, Mr. Owen Harris 23
1 Cumings, Mrs. John Bradley (Florence Briggs Th... 51
2 Heikkinen, Miss. Laina 22
3 Futrelle, Mrs. Jacques Heath (Lily May Peel) 44
4 Allen, Mr. William Henry 24
# use a string method to split the 'Name' Series at commas (returns a Series of lists)
train["Name"].str.split(",").head()
0                           [Braund,  Mr. Owen Harris]
1    [Cumings,  Mrs. John Bradley (Florence Briggs ...
2                            [Heikkinen,  Miss. Laina]
3      [Futrelle,  Mrs. Jacques Heath (Lily May Peel)]
4                          [Allen,  Mr. William Henry]
Name: Name, dtype: object
# use a lambda function to extract the first element of each list (the last name)
train["Name"].str.split(",").apply(lambda x: x[0]).head()
0       Braund
1      Cumings
2    Heikkinen
3     Futrelle
4        Allen
Name: Name, dtype: object
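The lambda can be avoided entirely: with expand=True, str.split returns a DataFrame whose columns can be selected directly; a minimal sketch:

# expand the split pieces into columns, then select the first one
train["Name"].str.split(",", expand=True)[0].head()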

MultiIndex#

stocks = pd.read_csv("data/stocks.csv")
stocks
Date Close Volume Symbol
0 2016-10-03 31.50 14070500 CSCO
1 2016-10-03 112.52 21701800 AAPL
2 2016-10-03 57.42 19189500 MSFT
3 2016-10-04 113.00 29736800 AAPL
4 2016-10-04 57.24 20085900 MSFT
5 2016-10-04 31.35 18460400 CSCO
6 2016-10-05 57.64 16726400 MSFT
7 2016-10-05 31.59 11808600 CSCO
8 2016-10-05 113.05 21453100 AAPL
stocks.index
RangeIndex(start=0, stop=9, step=1)
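# grouping by two columns produces a Series with a MultiIndex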
ser = stocks.groupby(["Symbol", "Date"])["Close"].mean()
ser
Symbol  Date      
AAPL    2016-10-03    112.52
        2016-10-04    113.00
        2016-10-05    113.05
CSCO    2016-10-03     31.50
        2016-10-04     31.35
        2016-10-05     31.59
MSFT    2016-10-03     57.42
        2016-10-04     57.24
        2016-10-05     57.64
Name: Close, dtype: float64
ser.index
MultiIndex([('AAPL', '2016-10-03'),
            ('AAPL', '2016-10-04'),
            ('AAPL', '2016-10-05'),
            ('CSCO', '2016-10-03'),
            ('CSCO', '2016-10-04'),
            ('CSCO', '2016-10-05'),
            ('MSFT', '2016-10-03'),
            ('MSFT', '2016-10-04'),
            ('MSFT', '2016-10-05')],
           names=['Symbol', 'Date'])
ser.to_frame().pivot_table(index="Symbol", columns="Date")
Close
Date 2016-10-03 2016-10-04 2016-10-05
Symbol
AAPL 112.52 113.00 113.05
CSCO 31.50 31.35 31.59
MSFT 57.42 57.24 57.64
stocks = stocks.set_index(["Symbol", "Date"])
stocks
Close Volume
Symbol Date
CSCO 2016-10-03 31.50 14070500
AAPL 2016-10-03 112.52 21701800
MSFT 2016-10-03 57.42 19189500
AAPL 2016-10-04 113.00 29736800
MSFT 2016-10-04 57.24 20085900
CSCO 2016-10-04 31.35 18460400
MSFT 2016-10-05 57.64 16726400
CSCO 2016-10-05 31.59 11808600
AAPL 2016-10-05 113.05 21453100
stocks.index
MultiIndex([('CSCO', '2016-10-03'),
            ('AAPL', '2016-10-03'),
            ('MSFT', '2016-10-03'),
            ('AAPL', '2016-10-04'),
            ('MSFT', '2016-10-04'),
            ('CSCO', '2016-10-04'),
            ('MSFT', '2016-10-05'),
            ('CSCO', '2016-10-05'),
            ('AAPL', '2016-10-05')],
           names=['Symbol', 'Date'])
stocks = stocks.sort_index()
stocks
Close Volume
Symbol Date
AAPL 2016-10-03 112.52 21701800
2016-10-04 113.00 29736800
2016-10-05 113.05 21453100
CSCO 2016-10-03 31.50 14070500
2016-10-04 31.35 18460400
2016-10-05 31.59 11808600
MSFT 2016-10-03 57.42 19189500
2016-10-04 57.24 20085900
2016-10-05 57.64 16726400
stocks.loc[("AAPL", "2016-10-03"), :]
Close          112.52
Volume    21701800.00
Name: (AAPL, 2016-10-03), dtype: float64
stocks.loc[("AAPL", "2016-10-03"), "Close"]
np.float64(112.52)
stocks.loc[["AAPL", "MSFT"], :]
Close Volume
Symbol Date
AAPL 2016-10-03 112.52 21701800
2016-10-04 113.00 29736800
2016-10-05 113.05 21453100
MSFT 2016-10-03 57.42 19189500
2016-10-04 57.24 20085900
2016-10-05 57.64 16726400
stocks.loc[(["AAPL", "MSFT"], "2016-10-03"), :]
Close Volume
Symbol Date
AAPL 2016-10-03 112.52 21701800
MSFT 2016-10-03 57.42 19189500
stocks.loc[(["AAPL", "MSFT"], "2016-10-03"), "Close"]
Symbol  Date      
AAPL    2016-10-03    112.52
MSFT    2016-10-03     57.42
Name: Close, dtype: float64
stocks.loc[(slice(None), ["2016-10-03", "2016-10-04"]), :]
Close Volume
Symbol Date
AAPL 2016-10-03 112.52 21701800
2016-10-04 113.00 29736800
CSCO 2016-10-03 31.50 14070500
2016-10-04 31.35 18460400
MSFT 2016-10-03 57.42 19189500
2016-10-04 57.24 20085900
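pd.IndexSlice offers a more readable spelling of the same selection than slice(None); an equivalent sketch:

# equivalent selection with pd.IndexSlice
idx = pd.IndexSlice
stocks.loc[idx[:, ["2016-10-03", "2016-10-04"]], :]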

Pivot#

stocks = stocks.pivot_table(values="Close", index="Symbol", columns="Date")
stocks
Date 2016-10-03 2016-10-04 2016-10-05
Symbol
AAPL 112.52 113.00 113.05
CSCO 31.50 31.35 31.59
MSFT 57.42 57.24 57.64
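Since ser already carries a (Symbol, Date) MultiIndex, unstack produces the same wide table without pivot_table; a minimal sketch:

# move the inner index level ('Date') into the columns
ser.unstack()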

Clean#

Remove duplicate rows#

user_cols = ["user_id", "age", "gender", "occupation", "zip_code"]
users = pd.read_csv(
    "data/movie.user", sep="|", header=None, names=user_cols, index_col="user_id"
)
users.head()
age gender occupation zip_code
user_id
1 24 M technician 85711
2 53 F other 94043
3 23 M writer 32067
4 24 M technician 43537
5 33 F other 15213
users.shape
(943, 4)
# detect duplicate zip codes: True if an item is identical to a previous item
users["zip_code"].duplicated().tail()
user_id
939    False
940     True
941    False
942    False
943    False
Name: zip_code, dtype: bool
# count the duplicate items (True becomes 1, False becomes 0)
users["zip_code"].duplicated().sum()
np.int64(148)
# examine the duplicate rows (ignoring the first occurrence)
users.loc[users.duplicated(keep="first"), :]
age gender occupation zip_code
user_id
496 21 F student 55414
572 51 M educator 20003
621 17 M student 60402
684 28 M student 55414
733 44 F other 60630
805 27 F other 20009
890 32 M student 97301
# examine the duplicate rows (ignoring the last occurrence)
users.loc[users.duplicated(keep="last"), :]
age gender occupation zip_code
user_id
67 17 M student 60402
85 51 M educator 20003
198 21 F student 55414
350 32 M student 97301
428 28 M student 55414
437 27 F other 20009
460 44 F other 60630
# examine the duplicate rows (including all duplicates)
users.loc[users.duplicated(keep=False), :]
age gender occupation zip_code
user_id
67 17 M student 60402
85 51 M educator 20003
198 21 F student 55414
350 32 M student 97301
428 28 M student 55414
437 27 F other 20009
460 44 F other 60630
496 21 F student 55414
572 51 M educator 20003
621 17 M student 60402
684 28 M student 55414
733 44 F other 60630
805 27 F other 20009
890 32 M student 97301
# drop the duplicate rows (inplace=False by default)
users.drop_duplicates(keep="first").shape
(936, 4)
users.drop_duplicates(keep="last").shape
(936, 4)
users.drop_duplicates(keep=False).shape
(929, 4)
# only consider a subset of columns when identifying duplicates
users.duplicated(subset=["age", "zip_code"]).sum()
np.int64(16)
users.drop_duplicates(subset=["age", "zip_code"]).shape
(927, 4)

Detect missing values#

ufo = pd.read_csv("data/ufo.csv")
ufo.tail()
City Colors Reported Shape Reported State Time
18236 Grant Park NaN TRIANGLE IL 12/31/2000 23:00
18237 Spirit Lake NaN DISK IA 12/31/2000 23:00
18238 Eagle River NaN NaN WI 12/31/2000 23:45
18239 Eagle River RED LIGHT WI 12/31/2000 23:45
18240 Ybor NaN OVAL FL 12/31/2000 23:59
ufo.isna().tail()
City Colors Reported Shape Reported State Time
18236 False True False False False
18237 False True False False False
18238 False True True False False
18239 False False False False False
18240 False True False False False
ufo.notna().tail()
City Colors Reported Shape Reported State Time
18236 True False True True True
18237 True False True True True
18238 True False False True True
18239 True True True True True
18240 True False True True True
# count the number of missing values in each Series
ufo.isna().sum()
City                  26
Colors Reported    15359
Shape Reported      2644
State                  0
Time                   0
dtype: int64
# use the 'isna' Series method to filter the DataFrame rows
ufo.loc[ufo.City.isna()].head()
City Colors Reported Shape Reported State Time
21 NaN NaN NaN LA 8/15/1943 0:00
22 NaN NaN LIGHT LA 8/15/1943 0:00
204 NaN NaN DISK CA 7/15/1952 12:30
241 NaN BLUE DISK MT 7/4/1953 14:00
613 NaN NaN DISK NV 7/1/1960 12:00
# examine the number of rows and columns
ufo.shape
(18241, 5)
# if 'any' values are missing in a row, then drop that row
ufo.dropna(how="any").shape
(2486, 5)
# 'dropna' returns a new DataFrame by default (inplace=False), so the original is unchanged
ufo.shape
(18241, 5)
# if 'all' values are missing in a row, then drop that row (none are dropped in this case)
ufo.dropna(how="all").shape
(18241, 5)
# if 'any' values are missing in a row (considering only 'City' and 'Shape Reported'), then drop that row
ufo.dropna(subset=["City", "Shape Reported"], how="any").shape
(15575, 5)
# 'value_counts' does not include missing values by default
ufo["Shape Reported"].value_counts().head()
Shape Reported
LIGHT       2803
DISK        2122
TRIANGLE    1889
OTHER       1402
CIRCLE      1365
Name: count, dtype: int64
# explicitly include missing values
ufo["Shape Reported"].value_counts(dropna=False).head()
Shape Reported
LIGHT       2803
NaN         2644
DISK        2122
TRIANGLE    1889
OTHER       1402
Name: count, dtype: int64

Fill missing values#

# fill in missing values with a specified value
ufo["Shape Reported"] = ufo["Shape Reported"].fillna(value="VARIOUS")
# confirm that the missing values were filled in
ufo["Shape Reported"].value_counts().head()
Shape Reported
VARIOUS     2977
LIGHT       2803
DISK        2122
TRIANGLE    1889
OTHER       1402
Name: count, dtype: int64
# fill in missing values
ufo.fillna(value="UNKNOWN").head()
City Colors Reported Shape Reported State Time
0 Ithaca UNKNOWN TRIANGLE NY 6/1/1930 22:00
1 Willingboro UNKNOWN OTHER NJ 6/30/1930 20:00
2 Holyoke UNKNOWN OVAL CO 2/15/1931 14:00
3 Abilene UNKNOWN DISK KS 6/1/1931 13:00
4 New York Worlds Fair UNKNOWN LIGHT NY 4/18/1933 19:00
# fill missing values using the "backward fill" strategy (returns a copy; the DataFrame itself is unchanged)
ufo.bfill().tail()
City Colors Reported Shape Reported State Time
18236 Grant Park RED TRIANGLE IL 12/31/2000 23:00
18237 Spirit Lake RED DISK IA 12/31/2000 23:00
18238 Eagle River RED VARIOUS WI 12/31/2000 23:45
18239 Eagle River RED LIGHT WI 12/31/2000 23:45
18240 Ybor NaN OVAL FL 12/31/2000 23:59
# compare with the "forward fill" strategy (returns a copy; the DataFrame itself is unchanged)
ufo.ffill().tail()
City Colors Reported Shape Reported State Time
18236 Grant Park RED TRIANGLE IL 12/31/2000 23:00
18237 Spirit Lake RED DISK IA 12/31/2000 23:00
18238 Eagle River RED VARIOUS WI 12/31/2000 23:45
18239 Eagle River RED LIGHT WI 12/31/2000 23:45
18240 Ybor RED OVAL FL 12/31/2000 23:59

Where & Mask#

train = pd.read_csv("data/titanic_train.csv")
train.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
cond = train["Sex"] == "male"
# 'where' replaces values where the condition is False (the default replacement is NaN)
train["Sex"] = train["Sex"].where(cond, other="Female")
train.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... Female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina Female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) Female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
train["quality"] = ""
cond1 = train["Sex"] == "male"
cond2 = train["Age"] > 25
# keep the values where the condition is True; replace the rest with 'low'
train["quality"] = train["quality"].where(cond1 & cond2, other="low")
train.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked quality
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S low
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... Female 38.0 1 0 PC 17599 71.2833 C85 C low
2 3 1 3 Heikkinen, Miss. Laina Female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S low
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) Female 35.0 1 0 113803 53.1000 C123 S low
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
train["quality"] = ""
cond1 = train["Sex"] == "male"
cond2 = train["Age"] > 25
# 'mask' is the inverse of 'where': it replaces values where the condition is True
train["quality"] = train["quality"].mask(cond1 & cond2, other="low")
train.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked quality
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... Female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina Female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) Female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S low
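When there are more than two outcomes, chaining 'where'/'mask' gets clumsy; numpy.select maps a list of conditions to a list of labels. A sketch with illustrative labels:

import numpy as np

# the first matching condition wins; 'default' covers the remaining rows
conditions = [cond1 & cond2, cond1 & ~cond2]
train["quality"] = np.select(conditions, ["low", "medium"], default="high")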

Outliers (clip)#

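# cap values outside the range [10, 300] at the nearest bound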
drinks["wine_servings"].clip(10, 300)
0       10
1       54
2       14
3      300
4       45
      ... 
188     10
189     10
190     10
191     10
192     10
Name: wine_servings, Length: 193, dtype: int64
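The bounds need not be hard-coded; one common pattern clips to empirical quantiles instead. A sketch:

# clip to the 5th and 95th percentiles of the column
lower, upper = drinks["wine_servings"].quantile([0.05, 0.95])
drinks["wine_servings"].clip(lower, upper)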

String & Time#

Change the data type#

drinks = pd.read_csv("data/drinks.csv")
drinks.head()
country beer_servings spirit_servings wine_servings total_litres_of_pure_alcohol continent
0 Afghanistan 0 0 0 0.0 Asia
1 Albania 89 132 54 4.9 Europe
2 Algeria 25 0 14 0.7 Africa
3 Andorra 245 138 312 12.4 Europe
4 Angola 217 57 45 5.9 Africa
# examine the data type of each Series
drinks.dtypes
country                          object
beer_servings                     int64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
continent                        object
dtype: object
drinks.convert_dtypes().dtypes
country                         string[python]
beer_servings                            Int64
spirit_servings                          Int64
wine_servings                            Int64
total_litres_of_pure_alcohol           Float64
continent                       string[python]
dtype: object
# change the data type of an existing Series
drinks["beer_servings"] = drinks["beer_servings"].astype(float)
drinks.dtypes
country                          object
beer_servings                   float64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
continent                        object
dtype: object
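astype raises if any value cannot be parsed; for messy columns, pd.to_numeric with errors="coerce" converts the stragglers to NaN instead. A sketch with a small hypothetical Series:

# unparsable entries become NaN rather than raising an error
dirty = pd.Series(["1", "2", "oops"])
pd.to_numeric(dirty, errors="coerce")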

Use string methods#

orders = pd.read_csv("data/chipotle.tsv", sep="\t")
orders.head()
order_id quantity item_name choice_description item_price
0 1 1 Chips and Fresh Tomato Salsa NaN $2.39
1 1 1 Izze [Clementine] $3.39
2 1 1 Nantucket Nectar [Apple] $3.39
3 1 1 Chips and Tomatillo-Green Chili Salsa NaN $2.39
4 2 2 Chicken Bowl [Tomatillo-Red Chili Salsa (Hot), [Black Beans... $16.98
# string methods for pandas Series are accessed via 'str'
orders.item_name.str.upper().head()
0             CHIPS AND FRESH TOMATO SALSA
1                                     IZZE
2                         NANTUCKET NECTAR
3    CHIPS AND TOMATILLO-GREEN CHILI SALSA
4                             CHICKEN BOWL
Name: item_name, dtype: object
# use the boolean Series to filter the DataFrame
orders[orders.item_name.str.contains("Chicken")].head()
order_id quantity item_name choice_description item_price
4 2 2 Chicken Bowl [Tomatillo-Red Chili Salsa (Hot), [Black Beans... $16.98
5 3 1 Chicken Bowl [Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou... $10.98
11 6 1 Chicken Crispy Tacos [Roasted Chili Corn Salsa, [Fajita Vegetables,... $8.75
12 6 1 Chicken Soft Tacos [Roasted Chili Corn Salsa, [Rice, Black Beans,... $8.75
13 7 1 Chicken Bowl [Fresh Tomato Salsa, [Fajita Vegetables, Rice,... $11.25
# string methods can be chained together
orders.choice_description.str.replace("[", "").str.replace("]", "").head()
0                                                  NaN
1                                           Clementine
2                                                Apple
3                                                  NaN
4    Tomatillo-Red Chili Salsa (Hot), Black Beans, ...
Name: choice_description, dtype: object
# many pandas string methods support regular expressions (pass regex=True)
orders.choice_description.str.replace(r"[\[\]]", "", regex=True).head()
0                                                  NaN
1                                           Clementine
2                                                Apple
3                                                  NaN
4    Tomatillo-Red Chili Salsa (Hot), Black Beans, ...
Name: choice_description, dtype: object
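The same tools clean up 'item_price', which is stored as a string like '$2.39'; a minimal sketch:

# strip the dollar sign, then convert to float
orders["item_price"].str.replace("$", "", regex=False).astype(float).head()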

Dates & Times#

# read a dataset of UFO reports into a DataFrame
ufo = pd.read_csv("data/ufo.csv")
ufo.head()
City Colors Reported Shape Reported State Time
0 Ithaca NaN TRIANGLE NY 6/1/1930 22:00
1 Willingboro NaN OTHER NJ 6/30/1930 20:00
2 Holyoke NaN OVAL CO 2/15/1931 14:00
3 Abilene NaN DISK KS 6/1/1931 13:00
4 New York Worlds Fair NaN LIGHT NY 4/18/1933 19:00
# 'Time' is currently stored as a string
ufo.dtypes
City               object
Colors Reported    object
Shape Reported     object
State              object
Time               object
dtype: object
# convert 'Time' to datetime format
ufo["Time"] = pd.to_datetime(ufo.Time)
ufo.head()
City Colors Reported Shape Reported State Time
0 Ithaca NaN TRIANGLE NY 1930-06-01 22:00:00
1 Willingboro NaN OTHER NJ 1930-06-30 20:00:00
2 Holyoke NaN OVAL CO 1931-02-15 14:00:00
3 Abilene NaN DISK KS 1931-06-01 13:00:00
4 New York Worlds Fair NaN LIGHT NY 1933-04-18 19:00:00
ufo.dtypes
City                       object
Colors Reported            object
Shape Reported             object
State                      object
Time               datetime64[ns]
dtype: object
ufo = ufo.set_index("Time")
ufo.head()
City Colors Reported Shape Reported State
Time
1930-06-01 22:00:00 Ithaca NaN TRIANGLE NY
1930-06-30 20:00:00 Willingboro NaN OTHER NJ
1931-02-15 14:00:00 Holyoke NaN OVAL CO
1931-06-01 13:00:00 Abilene NaN DISK KS
1933-04-18 19:00:00 New York Worlds Fair NaN LIGHT NY
ufo.at_time("15:00")
City Colors Reported Shape Reported State
Time
1943-06-01 15:00:00 Nebraska NaN DISK NE
1947-07-15 15:00:00 Clintwood NaN CYLINDER VA
1947-07-15 15:00:00 Hazelton NaN DISK ID
1949-04-10 15:00:00 Seattle NaN DISK WA
1949-06-30 15:00:00 Knoxville NaN OTHER TN
... ... ... ... ...
2000-10-12 15:00:00 Mountain City NaN DISK TN
2000-10-28 15:00:00 Kintnersville ORANGE OTHER PA
2000-11-20 15:00:00 St. Joseph NaN EGG MO
2000-12-10 15:00:00 Marshfield NaN LIGHT WI
2000-12-27 15:00:00 Kea'au NaN SPHERE HI

254 rows × 4 columns

ufo.between_time("09:45", "12:00")
City Colors Reported Shape Reported State
Time
1941-07-02 11:30:00 Forest Home NaN CIRCLE CA
1943-10-15 11:00:00 Owensboro NaN RECTANGLE KY
1944-01-01 10:00:00 Wilderness NaN DISK WV
1944-01-01 12:00:00 San Diego NaN CIGAR CA
1944-01-01 12:00:00 Wilderness NaN DISK WV
... ... ... ... ...
2000-12-27 10:00:00 Santa Paula NaN LIGHT CA
2000-12-28 11:00:00 Cerrilillo NaN NaN NM
2000-12-28 12:00:00 Mansfield NaN FLASH TX
2000-12-30 10:00:00 Simi Valley NaN FORMATION CA
2000-12-31 11:45:00 Chicago NaN DISK IL

781 rows × 4 columns
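With a DatetimeIndex in place, individual date components are available as attributes (the '.dt' accessor exposes the same attributes on a datetime column); a quick sketch:

# extract components from the DatetimeIndex
ufo.index.year
ufo.index.month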

Categorical data#

drinks = pd.read_csv("data/drinks.csv")
drinks.head()
country beer_servings spirit_servings wine_servings total_litres_of_pure_alcohol continent
0 Afghanistan 0 0 0 0.0 Asia
1 Albania 89 132 54 4.9 Europe
2 Algeria 25 0 14 0.7 Africa
3 Andorra 245 138 312 12.4 Europe
4 Angola 217 57 45 5.9 Africa
drinks = drinks.convert_dtypes()
drinks.dtypes
country                         string[python]
beer_servings                            Int64
spirit_servings                          Int64
wine_servings                            Int64
total_litres_of_pure_alcohol           Float64
continent                       string[python]
dtype: object
# calculate the memory usage for each Series (in bytes)
drinks.memory_usage(deep=True)
Index                             132
country                         12588
beer_servings                    1737
spirit_servings                  1737
wine_servings                    1737
total_litres_of_pure_alcohol     1737
continent                       12332
dtype: int64
# use the 'category' data type to store the 'continent' strings as integers
drinks["continent"] = drinks["continent"].astype("category")
drinks.dtypes
country                         string[python]
beer_servings                            Int64
spirit_servings                          Int64
wine_servings                            Int64
total_litres_of_pure_alcohol           Float64
continent                             category
dtype: object
# strings are now encoded (0 means 'Africa', 1 means 'Asia', 2 means 'Europe', etc.)
drinks["continent"].cat.codes.head()
0    1
1    2
2    0
3    2
4    0
dtype: int8
# memory usage has been drastically reduced
drinks.memory_usage(deep=True)
Index                             132
country                         12588
beer_servings                    1737
spirit_servings                  1737
wine_servings                    1737
total_litres_of_pure_alcohol     1737
continent                         756
dtype: int64
# repeat this process for the 'country' Series
drinks["country"] = drinks["country"].astype("category")
drinks.memory_usage(deep=True)
Index                             132
country                         17142
beer_servings                    1737
spirit_servings                  1737
wine_servings                    1737
total_litres_of_pure_alcohol     1737
continent                         756
dtype: int64
# memory usage increased: all 193 unique strings are still stored, plus an integer code per row
drinks["country"].cat.categories
Index(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Antigua & Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria',
       ...
       'United Arab Emirates', 'United Kingdom', 'Uruguay', 'Uzbekistan',
       'Vanuatu', 'Venezuela', 'Vietnam', 'Yemen', 'Zambia', 'Zimbabwe'],
      dtype='string', length=193)

Dummy variables#

train = pd.read_csv("data/titanic_train.csv")
train.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
# use 'get_dummies' to create one column for every possible value
pd.get_dummies(train.Sex).head()
female male
0 False True
1 True False
2 True False
3 True False
4 False True
# drop the first dummy variable ('female') using the 'iloc' method
# add a prefix to identify the source of the dummy variables
pd.get_dummies(train.Sex, prefix="Sex").iloc[:, 1:].head()
Sex_male
0 True
1 False
2 False
3 False
4 True
# use 'get_dummies' with a feature that has 3 possible values
pd.get_dummies(train.Embarked, prefix="Embarked").head(10)
Embarked_C Embarked_Q Embarked_S
0 False False True
1 True False False
2 False False True
3 False False True
4 False False True
5 False True False
6 False False True
7 False False True
8 False False True
9 True False False
# drop the first dummy variable ('C')
pd.get_dummies(train.Embarked, prefix="Embarked").iloc[:, 1:].head(10)
Embarked_Q Embarked_S
0 False True
1 False False
2 False True
3 False True
4 False True
5 True False
6 False True
7 False True
8 False True
9 False False
# save the DataFrame of dummy variables and concatenate them to the original DataFrame
embarked_dummies = pd.get_dummies(train.Embarked, prefix="Embarked").iloc[:, 1:]
train = pd.concat([train, embarked_dummies], axis=1)
train.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Embarked_Q Embarked_S
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S False True
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C False False
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S False True
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S False True
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S False True
train = pd.read_csv("data/titanic_train.csv")
train.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
# pass the DataFrame to 'get_dummies' and specify which columns to dummy (it drops the original columns)
pd.get_dummies(train, columns=["Sex", "Embarked"]).head()
PassengerId Survived Pclass Name Age SibSp Parch Ticket Fare Cabin Sex_female Sex_male Embarked_C Embarked_Q Embarked_S
0 1 0 3 Braund, Mr. Owen Harris 22.0 1 0 A/5 21171 7.2500 NaN False True False False True
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 38.0 1 0 PC 17599 71.2833 C85 True False True False False
2 3 1 3 Heikkinen, Miss. Laina 26.0 0 0 STON/O2. 3101282 7.9250 NaN True False False False True
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 35.0 1 0 113803 53.1000 C123 True False False False True
4 5 0 3 Allen, Mr. William Henry 35.0 0 0 373450 8.0500 NaN False True False False True
# use the 'drop_first' parameter to drop the first dummy variable for each feature
pd.get_dummies(train, columns=["Sex", "Embarked"], drop_first=True).head()
PassengerId Survived Pclass Name Age SibSp Parch Ticket Fare Cabin Sex_male Embarked_Q Embarked_S
0 1 0 3 Braund, Mr. Owen Harris 22.0 1 0 A/5 21171 7.2500 NaN True False True
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 38.0 1 0 PC 17599 71.2833 C85 False False False
2 3 1 3 Heikkinen, Miss. Laina 26.0 0 0 STON/O2. 3101282 7.9250 NaN False False True
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 35.0 1 0 113803 53.1000 C123 False False True
4 5 0 3 Allen, Mr. William Henry 35.0 0 0 373450 8.0500 NaN True False True

Ordinal data#

# create a small DataFrame from a dictionary
small = pd.DataFrame(
    {"ID": [100, 101, 102, 103], "quality": ["good", "very good", "good", "excellent"]}
)
small.head()
ID quality
0 100 good
1 101 very good
2 102 good
3 103 excellent
# sort the DataFrame by the 'quality' Series (alphabetical order)
small.sort_values("quality")
ID quality
3 103 excellent
0 100 good
2 102 good
1 101 very good
from pandas.api.types import CategoricalDtype
# define a logical ordering for the categories
cats = ["good", "very good", "excellent"]
cat_type = CategoricalDtype(categories=cats, ordered=True)
small["quality"] = small.quality.astype(cat_type)
small.quality
0         good
1    very good
2         good
3    excellent
Name: quality, dtype: category
Categories (3, object): ['good' < 'very good' < 'excellent']
# sort the DataFrame by the 'quality' Series (logical order)
small.sort_values("quality")
ID quality
0 100 good
2 102 good
1 101 very good
3 103 excellent
# comparison operators work with ordered categories
small.loc[small.quality > "good", :]
ID quality
1 101 very good
3 103 excellent
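pd.cut builds ordered categories from numeric data in one step; a sketch with illustrative bin edges and labels:

# bin values into ordered categories (edges and labels are made up for illustration)
ages = pd.Series([5, 30, 70])
pd.cut(ages, bins=[0, 18, 60, 100], labels=["child", "adult", "senior"])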

Sample rows#

# use the 'random_state' parameter for reproducibility
ufo.sample(n=3, random_state=42)
City Colors Reported Shape Reported State
Time
1952-09-15 14:00:00 Norridgewock NaN DISK ME
1998-10-01 21:15:00 Ipava NaN TRIANGLE IL
2000-11-13 22:00:00 Ellinwood NaN FIREBALL KS
# sample 75% of the DataFrame's rows without replacement
train = ufo.sample(frac=0.75, random_state=99)
# store the remaining 25% of the rows in another DataFrame
test = ufo.loc[~ufo.index.isin(train.index), :]
test.head()
City Colors Reported Shape Reported State
Time
1933-04-18 19:00:00 New York Worlds Fair NaN LIGHT NY
1934-09-15 15:30:00 Valley City NaN DISK ND
1936-10-15 17:00:00 Eklutna NaN CIGAR AK
1939-06-01 20:00:00 Waterloo NaN FIREBALL AL
1939-07-07 02:00:00 Keokuk NaN OVAL IA