Cleaning and Preprocessing#
import pandas as pd
Explore#
drinks = pd.read_csv("data/drinks.csv")
drinks.head()
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa |
Max & Min#
drinks["beer_servings"].nlargest(10)
117 376
45 361
62 347
65 346
98 343
135 343
188 333
81 313
129 306
140 297
Name: beer_servings, dtype: int64
drinks["beer_servings"].idxmax()
117
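The index returned by idxmax can be fed straight into loc to inspect the full row; a minimal sketch:
# look up the row with the most beer servings
drinks.loc[drinks["beer_servings"].idxmax()]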
Count#
drinks["continent"].value_counts()
continent
Africa 53
Europe 45
Asia 44
North America 23
Oceania 16
South America 12
Name: count, dtype: int64
drinks["continent"].value_counts(normalize=True)
continent
Africa 0.274611
Europe 0.233161
Asia 0.227979
North America 0.119171
Oceania 0.082902
South America 0.062176
Name: proportion, dtype: float64
# access the Series index
drinks["continent"].value_counts().index
Index(['Africa', 'Europe', 'Asia', 'North America', 'Oceania',
'South America'],
dtype='object', name='continent')
# access the Series values
drinks["continent"].value_counts().array
<NumpyExtensionArray>
[np.int64(53), np.int64(45), np.int64(44), np.int64(23), np.int64(16),
np.int64(12)]
Length: 6, dtype: int64
# any Series can be sorted by its values
drinks.continent.value_counts().sort_values()
continent
South America 12
Oceania 16
North America 23
Asia 44
Europe 45
Africa 53
Name: count, dtype: int64
# any Series can also be sorted by its index
drinks["continent"].value_counts().sort_index()
continent
Africa 53
Asia 44
Europe 45
North America 23
Oceania 16
South America 12
Name: count, dtype: int64
drinks["continent"].unique()
array(['Asia', 'Europe', 'Africa', 'North America', 'South America',
'Oceania'], dtype=object)
drinks["continent"].nunique()
6
Sort#
movies = pd.read_csv("data/imdb_1000.csv")
movies.head()
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
0 | 9.3 | The Shawshank Redemption | R | Crime | 142 | [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt... |
1 | 9.2 | The Godfather | R | Crime | 175 | [u'Marlon Brando', u'Al Pacino', u'James Caan'] |
2 | 9.1 | The Godfather: Part II | R | Crime | 200 | [u'Al Pacino', u'Robert De Niro', u'Robert Duv... |
3 | 9.0 | The Dark Knight | PG-13 | Action | 152 | [u'Christian Bale', u'Heath Ledger', u'Aaron E... |
4 | 8.9 | Pulp Fiction | R | Crime | 154 | [u'John Travolta', u'Uma Thurman', u'Samuel L.... |
# sort the 'title' Series in ascending order (returns a Series)
movies.title.sort_values().head()
542 (500) Days of Summer
5 12 Angry Men
201 12 Years a Slave
698 127 Hours
110 2001: A Space Odyssey
Name: title, dtype: object
# sort in descending order instead
movies.title.sort_values(ascending=False).head()
864 [Rec]
526 Zulu
615 Zombieland
677 Zodiac
955 Zero Dark Thirty
Name: title, dtype: object
# sort the entire DataFrame by the 'title' Series (returns a DataFrame)
movies.sort_values("title").head()
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
542 | 7.8 | (500) Days of Summer | PG-13 | Comedy | 95 | [u'Zooey Deschanel', u'Joseph Gordon-Levitt', ... |
5 | 8.9 | 12 Angry Men | NOT RATED | Drama | 96 | [u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals... |
201 | 8.1 | 12 Years a Slave | R | Biography | 134 | [u'Chiwetel Ejiofor', u'Michael Kenneth Willia... |
698 | 7.6 | 127 Hours | R | Adventure | 94 | [u'James Franco', u'Amber Tamblyn', u'Kate Mara'] |
110 | 8.3 | 2001: A Space Odyssey | G | Mystery | 160 | [u'Keir Dullea', u'Gary Lockwood', u'William S... |
# sort in descending order instead
movies.sort_values("title", ascending=False).head()
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
864 | 7.5 | [Rec] | R | Horror | 78 | [u'Manuela Velasco', u'Ferran Terraza', u'Jorg... |
526 | 7.8 | Zulu | UNRATED | Drama | 138 | [u'Stanley Baker', u'Jack Hawkins', u'Ulla Jac... |
615 | 7.7 | Zombieland | R | Comedy | 88 | [u'Jesse Eisenberg', u'Emma Stone', u'Woody Ha... |
677 | 7.7 | Zodiac | R | Crime | 157 | [u'Jake Gyllenhaal', u'Robert Downey Jr.', u'M... |
955 | 7.4 | Zero Dark Thirty | R | Drama | 157 | [u'Jessica Chastain', u'Joel Edgerton', u'Chri... |
# sort the DataFrame first by 'content_rating', then by 'duration'
movies.sort_values(["content_rating", "duration"]).head()
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
713 | 7.6 | The Jungle Book | APPROVED | Animation | 78 | [u'Phil Harris', u'Sebastian Cabot', u'Louis P... |
513 | 7.8 | Invasion of the Body Snatchers | APPROVED | Horror | 80 | [u'Kevin McCarthy', u'Dana Wynter', u'Larry Ga... |
272 | 8.1 | The Killing | APPROVED | Crime | 85 | [u'Sterling Hayden', u'Coleen Gray', u'Vince E... |
703 | 7.6 | Dracula | APPROVED | Horror | 85 | [u'Bela Lugosi', u'Helen Chandler', u'David Ma... |
612 | 7.7 | A Hard Day's Night | APPROVED | Comedy | 87 | [u'John Lennon', u'Paul McCartney', u'George H... |
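The 'ascending' parameter also accepts a list with one flag per sort key, so the directions can be mixed; a minimal sketch:
# sort 'content_rating' ascending but 'duration' descending
movies.sort_values(["content_rating", "duration"], ascending=[True, False]).head()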
Crosstab#
pd.crosstab(drinks["country"], drinks["continent"])
continent | Africa | Asia | Europe | North America | Oceania | South America |
---|---|---|---|---|---|---|
country | ||||||
Afghanistan | 0 | 1 | 0 | 0 | 0 | 0 |
Albania | 0 | 0 | 1 | 0 | 0 | 0 |
Algeria | 1 | 0 | 0 | 0 | 0 | 0 |
Andorra | 0 | 0 | 1 | 0 | 0 | 0 |
Angola | 1 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... |
Venezuela | 0 | 0 | 0 | 0 | 0 | 1 |
Vietnam | 0 | 1 | 0 | 0 | 0 | 0 |
Yemen | 0 | 1 | 0 | 0 | 0 | 0 |
Zambia | 1 | 0 | 0 | 0 | 0 | 0 |
Zimbabwe | 1 | 0 | 0 | 0 | 0 | 0 |
193 rows × 6 columns
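crosstab can tabulate any pair of Series, and margins=True appends row and column totals; a minimal sketch using a derived boolean:
# cross-tabulate continent against heavy beer consumption, with totals
pd.crosstab(drinks["continent"], drinks["beer_servings"] > 100, margins=True)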
Transform#
Group#
drinks = pd.read_csv("data/drinks.csv")
drinks.head()
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa |
# select the row at position 5 (the sixth row) within each continent group
drinks.groupby("continent").nth(5)
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
15 | Belarus | 142 | 373 | 42 | 14.4 | Europe |
27 | Burundi | 88 | 0 | 0 | 6.3 | Africa |
30 | Cambodia | 57 | 65 | 1 | 2.2 | Asia |
41 | Costa Rica | 149 | 87 | 11 | 4.4 | North America |
52 | Ecuador | 162 | 74 | 3 | 4.2 | South America |
110 | Micronesia | 62 | 50 | 18 | 2.3 | Oceania |
# calculate the mean beer servings for each continent
drinks.groupby("continent")["beer_servings"].mean()
continent
Africa 61.471698
Asia 37.045455
Europe 193.777778
North America 145.434783
Oceania 89.687500
South America 175.083333
Name: beer_servings, dtype: float64
# other aggregation functions (such as 'max') can also be used with groupby
drinks.groupby("continent")["beer_servings"].max()
continent
Africa 376
Asia 247
Europe 361
North America 285
Oceania 306
South America 333
Name: beer_servings, dtype: int64
# multiple aggregation functions can be applied simultaneously
drinks.groupby("continent")["beer_servings"].agg(["count", "mean", "min", "max"])
count | mean | min | max | |
---|---|---|---|---|
continent | ||||
Africa | 53 | 61.471698 | 0 | 376 |
Asia | 44 | 37.045455 | 0 | 247 |
Europe | 45 | 193.777778 | 0 | 361 |
North America | 23 | 145.434783 | 1 | 285 |
Oceania | 16 | 89.687500 | 0 | 306 |
South America | 12 | 175.083333 | 93 | 333 |
Apply#
drinks["average_wine_servings"] = drinks.groupby("continent")[
"wine_servings"
].transform("mean")
drinks.head()
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | average_wine_servings | |
---|---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia | 9.068182 |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe | 142.222222 |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa | 16.264151 |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe | 142.222222 |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa | 16.264151 |
drinks["average_wine_servings"] = drinks["average_wine_servings"].apply(
lambda x: x / 100
)
drinks.head()
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | average_wine_servings | |
---|---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia | 0.090682 |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe | 1.422222 |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa | 0.162642 |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe | 1.422222 |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa | 0.162642 |
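For a simple arithmetic step like this, vectorized division is equivalent and avoids calling a Python lambda per element; a minimal sketch:
# equivalent vectorized form, computed directly from the group means
drinks.groupby("continent")["wine_servings"].transform("mean") / 100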
iris = pd.read_csv("data/iris.csv")
iris.head()
sepal_length | sepal_width | petal_length | petal_width | species | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
# count the number of rows in each group
iris.groupby("species").size()
species
setosa 50
versicolor 50
virginica 50
dtype: int64
# find the index label of each column's maximum value, per group
iris.groupby("species").idxmax()
sepal_length | sepal_width | petal_length | petal_width | |
---|---|---|---|---|
species | ||||
setosa | 14 | 15 | 24 | 43 |
versicolor | 50 | 85 | 83 | 70 |
virginica | 131 | 117 | 118 | 100 |
# named aggregation: each keyword defines an output column as (input_column, function)
iris.groupby("species").agg(
sepal_min=("sepal_length", "min"),
sepal_max=("sepal_length", "max"),
petal_mean=("petal_length", "mean"),
petal_std=("petal_length", "std"),
)
sepal_min | sepal_max | petal_mean | petal_std | |
---|---|---|---|---|
species | ||||
setosa | 4.3 | 5.8 | 1.462 | 0.173664 |
versicolor | 4.9 | 7.0 | 4.260 | 0.469911 |
virginica | 4.9 | 7.9 | 5.552 | 0.551895 |
Apply a function#
train = pd.read_csv("data/titanic_train.csv")
train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
# map 'female' to 0 and 'male' to 1
train["Sex_num"] = train.Sex.map({"female": 0, "male": 1})
train.loc[0:4, ["Sex", "Sex_num"]]
Sex | Sex_num | |
---|---|---|
0 | male | 1 |
1 | female | 0 |
2 | female | 0 |
3 | female | 0 |
4 | male | 1 |
# calculate the length of each string in the 'Name' Series
train["Name_length"] = train["Name"].apply(len)
train.loc[0:4, ["Name", "Name_length"]]
Name | Name_length | |
---|---|---|
0 | Braund, Mr. Owen Harris | 23 |
1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | 51 |
2 | Heikkinen, Miss. Laina | 22 |
3 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | 44 |
4 | Allen, Mr. William Henry | 24 |
# use a string method to split the 'Name' Series at commas (returns a Series of lists)
train["Name"].str.split(",").head()
0 [Braund, Mr. Owen Harris]
1 [Cumings, Mrs. John Bradley (Florence Briggs ...
2 [Heikkinen, Miss. Laina]
3 [Futrelle, Mrs. Jacques Heath (Lily May Peel)]
4 [Allen, Mr. William Henry]
Name: Name, dtype: object
# use a lambda function to extract the last name (the part before the comma)
train["Name"].str.split(",").apply(lambda x: x[0]).head()
0 Braund
1 Cumings
2 Heikkinen
3 Futrelle
4 Allen
Name: Name, dtype: object
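The same extraction works without a lambda, since the str accessor can index into each element's list; a minimal sketch:
# equivalent: select the first element of each list via the 'str' accessor
train["Name"].str.split(",").str[0].head()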
MultiIndex#
stocks = pd.read_csv("data/stocks.csv")
stocks
Date | Close | Volume | Symbol | |
---|---|---|---|---|
0 | 2016-10-03 | 31.50 | 14070500 | CSCO |
1 | 2016-10-03 | 112.52 | 21701800 | AAPL |
2 | 2016-10-03 | 57.42 | 19189500 | MSFT |
3 | 2016-10-04 | 113.00 | 29736800 | AAPL |
4 | 2016-10-04 | 57.24 | 20085900 | MSFT |
5 | 2016-10-04 | 31.35 | 18460400 | CSCO |
6 | 2016-10-05 | 57.64 | 16726400 | MSFT |
7 | 2016-10-05 | 31.59 | 11808600 | CSCO |
8 | 2016-10-05 | 113.05 | 21453100 | AAPL |
stocks.index
RangeIndex(start=0, stop=9, step=1)
# grouping by two columns produces a Series with a MultiIndex
ser = stocks.groupby(["Symbol", "Date"])["Close"].mean()
ser
Symbol Date
AAPL 2016-10-03 112.52
2016-10-04 113.00
2016-10-05 113.05
CSCO 2016-10-03 31.50
2016-10-04 31.35
2016-10-05 31.59
MSFT 2016-10-03 57.42
2016-10-04 57.24
2016-10-05 57.64
Name: Close, dtype: float64
ser.index
MultiIndex([('AAPL', '2016-10-03'),
('AAPL', '2016-10-04'),
('AAPL', '2016-10-05'),
('CSCO', '2016-10-03'),
('CSCO', '2016-10-04'),
('CSCO', '2016-10-05'),
('MSFT', '2016-10-03'),
('MSFT', '2016-10-04'),
('MSFT', '2016-10-05')],
names=['Symbol', 'Date'])
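With a two-level index, unstack pivots the inner level into columns directly; a minimal sketch that should match the pivot_table result below:
# move the inner index level ('Date') into columns
ser.unstack()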
# reshape the MultiIndexed Series into a Symbol-by-Date table
ser.to_frame().pivot_table(index="Symbol", columns="Date")
Close | |||
---|---|---|---|
Date | 2016-10-03 | 2016-10-04 | 2016-10-05 |
Symbol | |||
AAPL | 112.52 | 113.00 | 113.05 |
CSCO | 31.50 | 31.35 | 31.59 |
MSFT | 57.42 | 57.24 | 57.64 |
stocks = stocks.set_index(["Symbol", "Date"])
stocks
Close | Volume | ||
---|---|---|---|
Symbol | Date | ||
CSCO | 2016-10-03 | 31.50 | 14070500 |
AAPL | 2016-10-03 | 112.52 | 21701800 |
MSFT | 2016-10-03 | 57.42 | 19189500 |
AAPL | 2016-10-04 | 113.00 | 29736800 |
MSFT | 2016-10-04 | 57.24 | 20085900 |
CSCO | 2016-10-04 | 31.35 | 18460400 |
MSFT | 2016-10-05 | 57.64 | 16726400 |
CSCO | 2016-10-05 | 31.59 | 11808600 |
AAPL | 2016-10-05 | 113.05 | 21453100 |
stocks.index
MultiIndex([('CSCO', '2016-10-03'),
('AAPL', '2016-10-03'),
('MSFT', '2016-10-03'),
('AAPL', '2016-10-04'),
('MSFT', '2016-10-04'),
('CSCO', '2016-10-04'),
('MSFT', '2016-10-05'),
('CSCO', '2016-10-05'),
('AAPL', '2016-10-05')],
names=['Symbol', 'Date'])
stocks = stocks.sort_index()
stocks
Close | Volume | ||
---|---|---|---|
Symbol | Date | ||
AAPL | 2016-10-03 | 112.52 | 21701800 |
2016-10-04 | 113.00 | 29736800 | |
2016-10-05 | 113.05 | 21453100 | |
CSCO | 2016-10-03 | 31.50 | 14070500 |
2016-10-04 | 31.35 | 18460400 | |
2016-10-05 | 31.59 | 11808600 | |
MSFT | 2016-10-03 | 57.42 | 19189500 |
2016-10-04 | 57.24 | 20085900 | |
2016-10-05 | 57.64 | 16726400 |
# select a single row by passing a tuple with one value per index level
stocks.loc[("AAPL", "2016-10-03"), :]
Close 112.52
Volume 21701800.00
Name: (AAPL, 2016-10-03), dtype: float64
stocks.loc[("AAPL", "2016-10-03"), "Close"]
np.float64(112.52)
stocks.loc[["AAPL", "MSFT"], :]
Close | Volume | ||
---|---|---|---|
Symbol | Date | ||
AAPL | 2016-10-03 | 112.52 | 21701800 |
2016-10-04 | 113.00 | 29736800 | |
2016-10-05 | 113.05 | 21453100 | |
MSFT | 2016-10-03 | 57.42 | 19189500 |
2016-10-04 | 57.24 | 20085900 | |
2016-10-05 | 57.64 | 16726400 |
stocks.loc[(["AAPL", "MSFT"], "2016-10-03"), :]
Close | Volume | ||
---|---|---|---|
Symbol | Date | ||
AAPL | 2016-10-03 | 112.52 | 21701800 |
MSFT | 2016-10-03 | 57.42 | 19189500 |
stocks.loc[(["AAPL", "MSFT"], "2016-10-03"), "Close"]
Symbol Date
AAPL 2016-10-03 112.52
MSFT 2016-10-03 57.42
Name: Close, dtype: float64
# use slice(None) to select all values of the first ('Symbol') level
stocks.loc[(slice(None), ["2016-10-03", "2016-10-04"]), :]
Close | Volume | ||
---|---|---|---|
Symbol | Date | ||
AAPL | 2016-10-03 | 112.52 | 21701800 |
2016-10-04 | 113.00 | 29736800 | |
CSCO | 2016-10-03 | 31.50 | 14070500 |
2016-10-04 | 31.35 | 18460400 | |
MSFT | 2016-10-03 | 57.42 | 19189500 |
2016-10-04 | 57.24 | 20085900 |
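pd.IndexSlice expresses the same selection more readably than nested slice(None) tuples; a minimal sketch:
# equivalent selection with pd.IndexSlice (the index must be sorted)
idx = pd.IndexSlice
stocks.loc[idx[:, "2016-10-03":"2016-10-04"], :]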
Pivot#
stocks = stocks.pivot_table(values="Close", index="Symbol", columns="Date")
stocks
Date | 2016-10-03 | 2016-10-04 | 2016-10-05 |
---|---|---|---|
Symbol | |||
AAPL | 112.52 | 113.00 | 113.05 |
CSCO | 31.50 | 31.35 | 31.59 |
MSFT | 57.42 | 57.24 | 57.64 |
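stack is the inverse reshape, folding the Date columns back into a row index level; a minimal sketch:
# fold the 'Date' columns back into the index, recovering a MultiIndexed Series
stocks.stack()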
Clean#
Remove duplicate rows#
user_cols = ["user_id", "age", "gender", "occupation", "zip_code"]
users = pd.read_csv(
"data/movie.user", sep="|", header=None, names=user_cols, index_col="user_id"
)
users.head()
age | gender | occupation | zip_code | |
---|---|---|---|---|
user_id | ||||
1 | 24 | M | technician | 85711 |
2 | 53 | F | other | 94043 |
3 | 23 | M | writer | 32067 |
4 | 24 | M | technician | 43537 |
5 | 33 | F | other | 15213 |
users.shape
(943, 4)
# detect duplicate zip codes: True if an item is identical to a previous item
users["zip_code"].duplicated().tail()
user_id
939 False
940 True
941 False
942 False
943 False
Name: zip_code, dtype: bool
# count the duplicate items (True becomes 1, False becomes 0)
users["zip_code"].duplicated().sum()
np.int64(148)
# examine the duplicate rows (ignoring the first occurrence)
users.loc[users.duplicated(keep="first"), :]
age | gender | occupation | zip_code | |
---|---|---|---|---|
user_id | ||||
496 | 21 | F | student | 55414 |
572 | 51 | M | educator | 20003 |
621 | 17 | M | student | 60402 |
684 | 28 | M | student | 55414 |
733 | 44 | F | other | 60630 |
805 | 27 | F | other | 20009 |
890 | 32 | M | student | 97301 |
# examine the duplicate rows (ignoring the last occurrence)
users.loc[users.duplicated(keep="last"), :]
age | gender | occupation | zip_code | |
---|---|---|---|---|
user_id | ||||
67 | 17 | M | student | 60402 |
85 | 51 | M | educator | 20003 |
198 | 21 | F | student | 55414 |
350 | 32 | M | student | 97301 |
428 | 28 | M | student | 55414 |
437 | 27 | F | other | 20009 |
460 | 44 | F | other | 60630 |
# examine the duplicate rows (including all duplicates)
users.loc[users.duplicated(keep=False), :]
age | gender | occupation | zip_code | |
---|---|---|---|---|
user_id | ||||
67 | 17 | M | student | 60402 |
85 | 51 | M | educator | 20003 |
198 | 21 | F | student | 55414 |
350 | 32 | M | student | 97301 |
428 | 28 | M | student | 55414 |
437 | 27 | F | other | 20009 |
460 | 44 | F | other | 60630 |
496 | 21 | F | student | 55414 |
572 | 51 | M | educator | 20003 |
621 | 17 | M | student | 60402 |
684 | 28 | M | student | 55414 |
733 | 44 | F | other | 60630 |
805 | 27 | F | other | 20009 |
890 | 32 | M | student | 97301 |
# drop the duplicate rows (inplace=False by default)
users.drop_duplicates(keep="first").shape
(936, 4)
users.drop_duplicates(keep="last").shape
(936, 4)
users.drop_duplicates(keep=False).shape
(929, 4)
# only consider a subset of columns when identifying duplicates
users.duplicated(subset=["age", "zip_code"]).sum()
np.int64(16)
users.drop_duplicates(subset=["age", "zip_code"]).shape
(927, 4)
Detect missing values#
ufo = pd.read_csv("data/ufo.csv")
ufo.tail()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
18236 | Grant Park | NaN | TRIANGLE | IL | 12/31/2000 23:00 |
18237 | Spirit Lake | NaN | DISK | IA | 12/31/2000 23:00 |
18238 | Eagle River | NaN | NaN | WI | 12/31/2000 23:45 |
18239 | Eagle River | RED | LIGHT | WI | 12/31/2000 23:45 |
18240 | Ybor | NaN | OVAL | FL | 12/31/2000 23:59 |
ufo.isna().tail()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
18236 | False | True | False | False | False |
18237 | False | True | False | False | False |
18238 | False | True | True | False | False |
18239 | False | False | False | False | False |
18240 | False | True | False | False | False |
ufo.notna().tail()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
18236 | True | False | True | True | True |
18237 | True | False | True | True | True |
18238 | True | False | False | True | True |
18239 | True | True | True | True | True |
18240 | True | False | True | True | True |
# count the number of missing values in each Series
ufo.isna().sum()
City 26
Colors Reported 15359
Shape Reported 2644
State 0
Time 0
dtype: int64
# use the 'isna' Series method to filter the DataFrame rows
ufo.loc[ufo.City.isna()].head()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
21 | NaN | NaN | NaN | LA | 8/15/1943 0:00 |
22 | NaN | NaN | LIGHT | LA | 8/15/1943 0:00 |
204 | NaN | NaN | DISK | CA | 7/15/1952 12:30 |
241 | NaN | BLUE | DISK | MT | 7/4/1953 14:00 |
613 | NaN | NaN | DISK | NV | 7/1/1960 12:00 |
# examine the number of rows and columns
ufo.shape
(18241, 5)
# if 'any' values are missing in a row, then drop that row
ufo.dropna(how="any").shape
(2486, 5)
# 'inplace' parameter of 'dropna' is False by default, so the rows were only dropped in the returned copy
ufo.shape
(18241, 5)
# if 'all' values are missing in a row, then drop that row (none are dropped in this case)
ufo.dropna(how="all").shape
(18241, 5)
# if 'any' values are missing in a row (considering only 'City' and 'Shape Reported'), then drop that row
ufo.dropna(subset=["City", "Shape Reported"], how="any").shape
(15575, 5)
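A completeness threshold is another option: thresh keeps only rows with at least that many non-missing values; a minimal sketch:
# keep rows that have at least 4 non-missing values
ufo.dropna(thresh=4).shape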
# 'value_counts' does not include missing values by default
ufo["Shape Reported"].value_counts().head()
Shape Reported
LIGHT 2803
DISK 2122
TRIANGLE 1889
OTHER 1402
CIRCLE 1365
Name: count, dtype: int64
# explicitly include missing values
ufo["Shape Reported"].value_counts(dropna=False).head()
Shape Reported
LIGHT 2803
NaN 2644
DISK 2122
TRIANGLE 1889
OTHER 1402
Name: count, dtype: int64
Fill missing values#
# fill in missing values with a specified value
ufo["Shape Reported"] = ufo["Shape Reported"].fillna(value="VARIOUS")
# confirm that the missing values were filled in
ufo["Shape Reported"].value_counts().head()
Shape Reported
VARIOUS 2977
LIGHT 2803
DISK 2122
TRIANGLE 1889
OTHER 1402
Name: count, dtype: int64
# fill in missing values across the entire DataFrame with a single placeholder
ufo.fillna(value="UNKNOWN").head()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | Ithaca | UNKNOWN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | UNKNOWN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | UNKNOWN | OVAL | CO | 2/15/1931 14:00 |
3 | Abilene | UNKNOWN | DISK | KS | 6/1/1931 13:00 |
4 | New York Worlds Fair | UNKNOWN | LIGHT | NY | 4/18/1933 19:00 |
# fill missing values using the "backward fill" strategy (doesn't affect the DataFrame since inplace=False)
ufo.bfill().tail()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
18236 | Grant Park | RED | TRIANGLE | IL | 12/31/2000 23:00 |
18237 | Spirit Lake | RED | DISK | IA | 12/31/2000 23:00 |
18238 | Eagle River | RED | VARIOUS | WI | 12/31/2000 23:45 |
18239 | Eagle River | RED | LIGHT | WI | 12/31/2000 23:45 |
18240 | Ybor | NaN | OVAL | FL | 12/31/2000 23:59 |
# compare with the "forward fill" strategy (doesn't affect the DataFrame since inplace=False)
ufo.ffill().tail()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
18236 | Grant Park | RED | TRIANGLE | IL | 12/31/2000 23:00 |
18237 | Spirit Lake | RED | DISK | IA | 12/31/2000 23:00 |
18238 | Eagle River | RED | VARIOUS | WI | 12/31/2000 23:45 |
18239 | Eagle River | RED | LIGHT | WI | 12/31/2000 23:45 |
18240 | Ybor | RED | OVAL | FL | 12/31/2000 23:59 |
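fillna also accepts a dict mapping column names to fill values, so different columns can be filled differently in one call; a minimal sketch:
# per-column fill values
ufo.fillna({"City": "UNKNOWN", "Colors Reported": "NONE REPORTED"}).tail()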
Where & Mask#
train = pd.read_csv("data/titanic_train.csv")
train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
cond = train["Sex"] == "male"
# replace the values that fail the condition (the default replacement is NaN)
train["Sex"] = train["Sex"].where(cond, other="Female")
train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | Female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | Female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | Female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
train["quality"] = ""
cond1 = train["Sex"] == "male"
cond2 = train["Age"] > 25
# keep values only where the condition holds; replace the rest with 'low'
train["quality"] = train["quality"].where(cond1 & cond2, other="low")
train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | quality | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | low |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | Female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | low |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | Female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | low |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | Female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | low |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
train["quality"] = ""
cond1 = train["Sex"] == "male"
cond2 = train["Age"] > 25
# mask (replace) the values where the condition holds with 'low'
train["quality"] = train["quality"].mask(cond1 & cond2, other="low")
train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | quality | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | Female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | Female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | Female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S | low |
Outliers (clip)#
drinks["wine_servings"].clip(10, 300)
0 10
1 54
2 14
3 300
4 45
...
188 10
189 10
190 10
191 10
192 10
Name: wine_servings, Length: 193, dtype: int64
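The bounds are often derived from the data itself, e.g. winsorizing at the 5th and 95th percentiles; a minimal sketch:
# clip to the 5th and 95th percentiles instead of fixed bounds
lower, upper = drinks["wine_servings"].quantile([0.05, 0.95])
drinks["wine_servings"].clip(lower, upper)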
String & Time#
Change the data type#
drinks = pd.read_csv("data/drinks.csv")
drinks.head()
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa |
# examine the data type of each Series
drinks.dtypes
country object
beer_servings int64
spirit_servings int64
wine_servings int64
total_litres_of_pure_alcohol float64
continent object
dtype: object
# convert each column to the best possible nullable dtype
drinks.convert_dtypes().dtypes
country string[python]
beer_servings Int64
spirit_servings Int64
wine_servings Int64
total_litres_of_pure_alcohol Float64
continent string[python]
dtype: object
# change the data type of an existing Series
drinks["beer_servings"] = drinks["beer_servings"].astype(float)
drinks.dtypes
country object
beer_servings float64
spirit_servings int64
wine_servings int64
total_litres_of_pure_alcohol float64
continent object
dtype: object
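When a column may contain unparseable strings, pd.to_numeric with errors="coerce" converts what it can and turns the rest into NaN; a minimal sketch:
# coerce unparseable values to NaN instead of raising an error
pd.to_numeric(drinks["beer_servings"], errors="coerce").head()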
Use string methods#
orders = pd.read_csv("data/chipotle.tsv", sep="\t")
orders.head()
order_id | quantity | item_name | choice_description | item_price | |
---|---|---|---|---|---|
0 | 1 | 1 | Chips and Fresh Tomato Salsa | NaN | $2.39 |
1 | 1 | 1 | Izze | [Clementine] | $3.39 |
2 | 1 | 1 | Nantucket Nectar | [Apple] | $3.39 |
3 | 1 | 1 | Chips and Tomatillo-Green Chili Salsa | NaN | $2.39 |
4 | 2 | 2 | Chicken Bowl | [Tomatillo-Red Chili Salsa (Hot), [Black Beans... | $16.98 |
# string methods for pandas Series are accessed via 'str'
orders.item_name.str.upper().head()
0 CHIPS AND FRESH TOMATO SALSA
1 IZZE
2 NANTUCKET NECTAR
3 CHIPS AND TOMATILLO-GREEN CHILI SALSA
4 CHICKEN BOWL
Name: item_name, dtype: object
# use the boolean Series to filter the DataFrame
orders[orders.item_name.str.contains("Chicken")].head()
order_id | quantity | item_name | choice_description | item_price | |
---|---|---|---|---|---|
4 | 2 | 2 | Chicken Bowl | [Tomatillo-Red Chili Salsa (Hot), [Black Beans... | $16.98 |
5 | 3 | 1 | Chicken Bowl | [Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou... | $10.98 |
11 | 6 | 1 | Chicken Crispy Tacos | [Roasted Chili Corn Salsa, [Fajita Vegetables,... | $8.75 |
12 | 6 | 1 | Chicken Soft Tacos | [Roasted Chili Corn Salsa, [Rice, Black Beans,... | $8.75 |
13 | 7 | 1 | Chicken Bowl | [Fresh Tomato Salsa, [Fajita Vegetables, Rice,... | $11.25 |
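contains is case-sensitive by default; case=False broadens the match, and na= controls how missing values are treated; a minimal sketch:
# case-insensitive match
orders[orders.item_name.str.contains("chicken", case=False)].head()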
# string methods can be chained together
orders.choice_description.str.replace("[", "").str.replace("]", "").head()
0 NaN
1 Clementine
2 Apple
3 NaN
4 Tomatillo-Red Chili Salsa (Hot), Black Beans, ...
Name: choice_description, dtype: object
# many pandas string methods support regular expressions (pass regex=True; the default is regex=False in pandas 2.x)
orders.choice_description.str.replace(r"[\[\]]", "", regex=True).head()
0                                                  NaN
1                                           Clementine
2                                                Apple
3                                                  NaN
4    Tomatillo-Red Chili Salsa (Hot), Black Beans, ...
Name: choice_description, dtype: object
Dates & Times#
# read a dataset of UFO reports into a DataFrame
ufo = pd.read_csv("data/ufo.csv")
ufo.head()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
3 | Abilene | NaN | DISK | KS | 6/1/1931 13:00 |
4 | New York Worlds Fair | NaN | LIGHT | NY | 4/18/1933 19:00 |
# 'Time' is currently stored as a string
ufo.dtypes
City object
Colors Reported object
Shape Reported object
State object
Time object
dtype: object
# convert 'Time' to datetime format
ufo["Time"] = pd.to_datetime(ufo.Time)
ufo.head()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 1930-06-01 22:00:00 |
1 | Willingboro | NaN | OTHER | NJ | 1930-06-30 20:00:00 |
2 | Holyoke | NaN | OVAL | CO | 1931-02-15 14:00:00 |
3 | Abilene | NaN | DISK | KS | 1931-06-01 13:00:00 |
4 | New York Worlds Fair | NaN | LIGHT | NY | 1933-04-18 19:00:00 |
ufo.dtypes
City object
Colors Reported object
Shape Reported object
State object
Time datetime64[ns]
dtype: object
ufo = ufo.set_index("Time")
ufo.head()
City | Colors Reported | Shape Reported | State | |
---|---|---|---|---|
Time | ||||
1930-06-01 22:00:00 | Ithaca | NaN | TRIANGLE | NY |
1930-06-30 20:00:00 | Willingboro | NaN | OTHER | NJ |
1931-02-15 14:00:00 | Holyoke | NaN | OVAL | CO |
1931-06-01 13:00:00 | Abilene | NaN | DISK | KS |
1933-04-18 19:00:00 | New York Worlds Fair | NaN | LIGHT | NY |
# select the rows reported at exactly 3pm
ufo.at_time("15:00")
City | Colors Reported | Shape Reported | State | |
---|---|---|---|---|
Time | ||||
1943-06-01 15:00:00 | Nebraska | NaN | DISK | NE |
1947-07-15 15:00:00 | Clintwood | NaN | CYLINDER | VA |
1947-07-15 15:00:00 | Hazelton | NaN | DISK | ID |
1949-04-10 15:00:00 | Seattle | NaN | DISK | WA |
1949-06-30 15:00:00 | Knoxville | NaN | OTHER | TN |
... | ... | ... | ... | ... |
2000-10-12 15:00:00 | Mountain City | NaN | DISK | TN |
2000-10-28 15:00:00 | Kintnersville | ORANGE | OTHER | PA |
2000-11-20 15:00:00 | St. Joseph | NaN | EGG | MO |
2000-12-10 15:00:00 | Marshfield | NaN | LIGHT | WI |
2000-12-27 15:00:00 | Kea'au | NaN | SPHERE | HI |
254 rows × 4 columns
# select the rows reported between 9:45am and noon
ufo.between_time("09:45", "12:00")
City | Colors Reported | Shape Reported | State | |
---|---|---|---|---|
Time | ||||
1941-07-02 11:30:00 | Forest Home | NaN | CIRCLE | CA |
1943-10-15 11:00:00 | Owensboro | NaN | RECTANGLE | KY |
1944-01-01 10:00:00 | Wilderness | NaN | DISK | WV |
1944-01-01 12:00:00 | San Diego | NaN | CIGAR | CA |
1944-01-01 12:00:00 | Wilderness | NaN | DISK | WV |
... | ... | ... | ... | ... |
2000-12-27 10:00:00 | Santa Paula | NaN | LIGHT | CA |
2000-12-28 11:00:00 | Cerrilillo | NaN | NaN | NM |
2000-12-28 12:00:00 | Mansfield | NaN | FLASH | TX |
2000-12-30 10:00:00 | Simi Valley | NaN | FORMATION | CA |
2000-12-31 11:45:00 | Chicago | NaN | DISK | IL |
781 rows × 4 columns
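A DatetimeIndex also supports partial-string indexing, which makes date-range selection concise; a minimal sketch:
# select all reports from the year 2000
ufo.loc["2000"].head()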
Categorical data#
drinks = pd.read_csv("data/drinks.csv")
drinks.head()
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa |
drinks = drinks.convert_dtypes()
drinks.dtypes
country string[python]
beer_servings Int64
spirit_servings Int64
wine_servings Int64
total_litres_of_pure_alcohol Float64
continent string[python]
dtype: object
# calculate the memory usage for each Series (in bytes)
drinks.memory_usage(deep=True)
Index 132
country 12588
beer_servings 1737
spirit_servings 1737
wine_servings 1737
total_litres_of_pure_alcohol 1737
continent 12332
dtype: int64
# use the 'category' data type to store the 'continent' strings as integers
drinks["continent"] = drinks["continent"].astype("category")
drinks.dtypes
country string[python]
beer_servings Int64
spirit_servings Int64
wine_servings Int64
total_litres_of_pure_alcohol Float64
continent category
dtype: object
# strings are now encoded (0 means 'Africa', 1 means 'Asia', 2 means 'Europe', etc.)
drinks["continent"].cat.codes.head()
0 1
1 2
2 0
3 2
4 0
dtype: int8
# memory usage has been drastically reduced
drinks.memory_usage(deep=True)
Index 132
country 12588
beer_servings 1737
spirit_servings 1737
wine_servings 1737
total_litres_of_pure_alcohol 1737
continent 756
dtype: int64
# repeat this process for the 'country' Series
drinks["country"] = drinks["country"].astype("category")
drinks.memory_usage(deep=True)
Index 132
country 17142
beer_servings 1737
spirit_servings 1737
wine_servings 1737
total_litres_of_pure_alcohol 1737
continent 756
dtype: int64
# memory usage increased because we created 193 categories
drinks["country"].cat.categories
Index(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
'Antigua & Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria',
...
'United Arab Emirates', 'United Kingdom', 'Uruguay', 'Uzbekistan',
'Vanuatu', 'Venezuela', 'Vietnam', 'Yemen', 'Zambia', 'Zimbabwe'],
dtype='string', length=193)
Dummy variables#
train = pd.read_csv("data/titanic_train.csv")
train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
# use 'get_dummies' to create one column for every possible value
pd.get_dummies(train.Sex).head()
female | male | |
---|---|---|
0 | False | True |
1 | True | False |
2 | True | False |
3 | True | False |
4 | False | True |
# drop the first dummy variable ('female') using the 'iloc' method
# add a prefix to identify the source of the dummy variables
pd.get_dummies(train.Sex, prefix="Sex").iloc[:, 1:].head()
Sex_male | |
---|---|
0 | True |
1 | False |
2 | False |
3 | False |
4 | True |
# use 'get_dummies' with a feature that has 3 possible values
pd.get_dummies(train.Embarked, prefix="Embarked").head(10)
Embarked_C | Embarked_Q | Embarked_S | |
---|---|---|---|
0 | False | False | True |
1 | True | False | False |
2 | False | False | True |
3 | False | False | True |
4 | False | False | True |
5 | False | True | False |
6 | False | False | True |
7 | False | False | True |
8 | False | False | True |
9 | True | False | False |
# drop the first dummy variable ('C')
pd.get_dummies(train.Embarked, prefix="Embarked").iloc[:, 1:].head(10)
Embarked_Q | Embarked_S | |
---|---|---|
0 | False | True |
1 | False | False |
2 | False | True |
3 | False | True |
4 | False | True |
5 | True | False |
6 | False | True |
7 | False | True |
8 | False | True |
9 | False | False |
# save the DataFrame of dummy variables and concatenate them to the original DataFrame
embarked_dummies = pd.get_dummies(train.Embarked, prefix="Embarked").iloc[:, 1:]
train = pd.concat([train, embarked_dummies], axis=1)
train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Embarked_Q | Embarked_S | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | False | True |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | False | False |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | False | True |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | False | True |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S | False | True |
train = pd.read_csv("data/titanic_train.csv")
train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
# pass the DataFrame to 'get_dummies' and specify which columns to dummy (it drops the original columns)
pd.get_dummies(train, columns=["Sex", "Embarked"]).head()
PassengerId | Survived | Pclass | Name | Age | SibSp | Parch | Ticket | Fare | Cabin | Sex_female | Sex_male | Embarked_C | Embarked_Q | Embarked_S | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | False | True | False | False | True |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | True | False | True | False | False |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | True | False | False | False | True |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | True | False | False | False | True |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | False | True | False | False | True |
# use the 'drop_first' parameter to drop the first dummy variable for each feature
pd.get_dummies(train, columns=["Sex", "Embarked"], drop_first=True).head()
PassengerId | Survived | Pclass | Name | Age | SibSp | Parch | Ticket | Fare | Cabin | Sex_male | Embarked_Q | Embarked_S | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | True | False | True |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | False | False | False |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | False | False | True |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | False | False | True |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | True | False | True |
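Since pandas 1.5, from_dummies can invert the encoding; a minimal sketch using the 'Sex' dummies (which have no missing values):
# round-trip the dummy columns back to a single column of labels
sex_dummies = pd.get_dummies(train["Sex"], prefix="Sex")
pd.from_dummies(sex_dummies, sep="_").head()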
Ordinal data#
# create a small DataFrame from a dictionary
small = pd.DataFrame(
{"ID": [100, 101, 102, 103], "quality": ["good", "very good", "good", "excellent"]}
)
small.head()
ID | quality | |
---|---|---|
0 | 100 | good |
1 | 101 | very good |
2 | 102 | good |
3 | 103 | excellent |
# sort the DataFrame by the 'quality' Series (alphabetical order)
small.sort_values("quality")
ID | quality | |
---|---|---|
3 | 103 | excellent |
0 | 100 | good |
2 | 102 | good |
1 | 101 | very good |
from pandas.api.types import CategoricalDtype
# define a logical ordering for the categories
cats = ["good", "very good", "excellent"]
cat_type = CategoricalDtype(categories=cats, ordered=True)
small["quality"] = small.quality.astype(cat_type)
small.quality
0 good
1 very good
2 good
3 excellent
Name: quality, dtype: category
Categories (3, object): ['good' < 'very good' < 'excellent']
# sort the DataFrame by the 'quality' Series (logical order)
small.sort_values("quality")
ID | quality | |
---|---|---|
0 | 100 | good |
2 | 102 | good |
1 | 101 | very good |
3 | 103 | excellent |
# comparison operators work with ordered categories
small.loc[small.quality > "good", :]
ID | quality | |
---|---|---|
1 | 101 | very good |
3 | 103 | excellent |
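The logical ordering also determines the integer codes, which is convenient for modeling; a minimal sketch:
# codes follow the defined order: good=0, very good=1, excellent=2
small["quality"].cat.codes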
Sample rows#
# use the 'random_state' parameter for reproducibility
ufo.sample(n=3, random_state=42)
City | Colors Reported | Shape Reported | State | |
---|---|---|---|---|
Time | ||||
1952-09-15 14:00:00 | Norridgewock | NaN | DISK | ME |
1998-10-01 21:15:00 | Ipava | NaN | TRIANGLE | IL |
2000-11-13 22:00:00 | Ellinwood | NaN | FIREBALL | KS |
# sample 75% of the DataFrame's rows without replacement
train = ufo.sample(frac=0.75, random_state=99)
# store the remaining 25% of the rows in another DataFrame
test = ufo.loc[~ufo.index.isin(train.index), :]
test.head()
City | Colors Reported | Shape Reported | State | |
---|---|---|---|---|
Time | ||||
1933-04-18 19:00:00 | New York Worlds Fair | NaN | LIGHT | NY |
1934-09-15 15:30:00 | Valley City | NaN | DISK | ND |
1936-10-15 17:00:00 | Eklutna | NaN | CIGAR | AK |
1939-06-01 20:00:00 | Waterloo | NaN | FIREBALL | AL |
1939-07-07 02:00:00 | Keokuk | NaN | OVAL | IA |