Cleaning and Preprocessing#
import pandas as pd
Explore#
drinks = pd.read_csv("data/drinks.csv")
drinks.head()
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa |
Max & Min#
drinks["beer_servings"].nlargest(10)
117 376
45 361
62 347
65 346
98 343
135 343
188 333
81 313
129 306
140 297
Name: beer_servings, dtype: int64
drinks["beer_servings"].idxmax()
117
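The index returned by idxmax can be fed straight into loc to inspect the full row; a minimal sketch:
# look up the row with the most beer servings
drinks.loc[drinks["beer_servings"].idxmax()]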
Count#
drinks["continent"].value_counts()
continent
Africa 53
Europe 45
Asia 44
North America 23
Oceania 16
South America 12
Name: count, dtype: int64
drinks["continent"].value_counts(normalize=True)
continent
Africa 0.274611
Europe 0.233161
Asia 0.227979
North America 0.119171
Oceania 0.082902
South America 0.062176
Name: proportion, dtype: float64
# access the Series index
drinks["continent"].value_counts().index
Index(['Africa', 'Europe', 'Asia', 'North America', 'Oceania',
'South America'],
dtype='object', name='continent')
# access the Series values
drinks["continent"].value_counts().array
<NumpyExtensionArray>
[np.int64(53), np.int64(45), np.int64(44), np.int64(23), np.int64(16),
np.int64(12)]
Length: 6, dtype: int64
# any Series can be sorted by its values
drinks.continent.value_counts().sort_values()
continent
South America 12
Oceania 16
North America 23
Asia 44
Europe 45
Africa 53
Name: count, dtype: int64
# any Series can also be sorted by its index
drinks["continent"].value_counts().sort_index()
continent
Africa 53
Asia 44
Europe 45
North America 23
Oceania 16
South America 12
Name: count, dtype: int64
drinks["continent"].unique()
array(['Asia', 'Europe', 'Africa', 'North America', 'South America',
'Oceania'], dtype=object)
drinks["continent"].nunique()
6
Sort#
movies = pd.read_csv("data/imdb_1000.csv")
movies.head()
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
0 | 9.3 | The Shawshank Redemption | R | Crime | 142 | [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt... |
1 | 9.2 | The Godfather | R | Crime | 175 | [u'Marlon Brando', u'Al Pacino', u'James Caan'] |
2 | 9.1 | The Godfather: Part II | R | Crime | 200 | [u'Al Pacino', u'Robert De Niro', u'Robert Duv... |
3 | 9.0 | The Dark Knight | PG-13 | Action | 152 | [u'Christian Bale', u'Heath Ledger', u'Aaron E... |
4 | 8.9 | Pulp Fiction | R | Crime | 154 | [u'John Travolta', u'Uma Thurman', u'Samuel L.... |
# sort the 'title' Series in ascending order (returns a Series)
movies.title.sort_values().head()
542 (500) Days of Summer
5 12 Angry Men
201 12 Years a Slave
698 127 Hours
110 2001: A Space Odyssey
Name: title, dtype: object
# sort in descending order instead
movies.title.sort_values(ascending=False).head()
864 [Rec]
526 Zulu
615 Zombieland
677 Zodiac
955 Zero Dark Thirty
Name: title, dtype: object
# sort the entire DataFrame by the 'title' Series (returns a DataFrame)
movies.sort_values("title").head()
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
542 | 7.8 | (500) Days of Summer | PG-13 | Comedy | 95 | [u'Zooey Deschanel', u'Joseph Gordon-Levitt', ... |
5 | 8.9 | 12 Angry Men | NOT RATED | Drama | 96 | [u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals... |
201 | 8.1 | 12 Years a Slave | R | Biography | 134 | [u'Chiwetel Ejiofor', u'Michael Kenneth Willia... |
698 | 7.6 | 127 Hours | R | Adventure | 94 | [u'James Franco', u'Amber Tamblyn', u'Kate Mara'] |
110 | 8.3 | 2001: A Space Odyssey | G | Mystery | 160 | [u'Keir Dullea', u'Gary Lockwood', u'William S... |
# sort in descending order instead
movies.sort_values("title", ascending=False).head()
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
864 | 7.5 | [Rec] | R | Horror | 78 | [u'Manuela Velasco', u'Ferran Terraza', u'Jorg... |
526 | 7.8 | Zulu | UNRATED | Drama | 138 | [u'Stanley Baker', u'Jack Hawkins', u'Ulla Jac... |
615 | 7.7 | Zombieland | R | Comedy | 88 | [u'Jesse Eisenberg', u'Emma Stone', u'Woody Ha... |
677 | 7.7 | Zodiac | R | Crime | 157 | [u'Jake Gyllenhaal', u'Robert Downey Jr.', u'M... |
955 | 7.4 | Zero Dark Thirty | R | Drama | 157 | [u'Jessica Chastain', u'Joel Edgerton', u'Chri... |
# sort the DataFrame first by 'content_rating', then by 'duration'
movies.sort_values(["content_rating", "duration"]).head()
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
713 | 7.6 | The Jungle Book | APPROVED | Animation | 78 | [u'Phil Harris', u'Sebastian Cabot', u'Louis P... |
513 | 7.8 | Invasion of the Body Snatchers | APPROVED | Horror | 80 | [u'Kevin McCarthy', u'Dana Wynter', u'Larry Ga... |
272 | 8.1 | The Killing | APPROVED | Crime | 85 | [u'Sterling Hayden', u'Coleen Gray', u'Vince E... |
703 | 7.6 | Dracula | APPROVED | Horror | 85 | [u'Bela Lugosi', u'Helen Chandler', u'David Ma... |
612 | 7.7 | A Hard Day's Night | APPROVED | Comedy | 87 | [u'John Lennon', u'Paul McCartney', u'George H... |
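The 'ascending' parameter also accepts a list with one flag per sort key, so the directions can be mixed; a minimal sketch:
# sort 'content_rating' ascending but 'duration' descending
movies.sort_values(["content_rating", "duration"], ascending=[True, False]).head()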
Crosstab#
pd.crosstab(drinks["country"], drinks["continent"])
continent | Africa | Asia | Europe | North America | Oceania | South America |
---|---|---|---|---|---|---|
country | ||||||
Afghanistan | 0 | 1 | 0 | 0 | 0 | 0 |
Albania | 0 | 0 | 1 | 0 | 0 | 0 |
Algeria | 1 | 0 | 0 | 0 | 0 | 0 |
Andorra | 0 | 0 | 1 | 0 | 0 | 0 |
Angola | 1 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... |
Venezuela | 0 | 0 | 0 | 0 | 0 | 1 |
Vietnam | 0 | 1 | 0 | 0 | 0 | 0 |
Yemen | 0 | 1 | 0 | 0 | 0 | 0 |
Zambia | 1 | 0 | 0 | 0 | 0 | 0 |
Zimbabwe | 1 | 0 | 0 | 0 | 0 | 0 |
193 rows × 6 columns
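crosstab can tabulate any pair of Series, and margins=True appends row and column totals; a minimal sketch using a derived boolean:
# cross-tabulate continent against heavy beer consumption, with totals
pd.crosstab(drinks["continent"], drinks["beer_servings"] > 100, margins=True)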
Transform#
Group#
drinks = pd.read_csv("data/drinks.csv")
drinks.head()
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa |
# select the row at position 5 (the sixth row) within each continent group
drinks.groupby("continent").nth(5)
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
15 | Belarus | 142 | 373 | 42 | 14.4 | Europe |
27 | Burundi | 88 | 0 | 0 | 6.3 | Africa |
30 | Cambodia | 57 | 65 | 1 | 2.2 | Asia |
41 | Costa Rica | 149 | 87 | 11 | 4.4 | North America |
52 | Ecuador | 162 | 74 | 3 | 4.2 | South America |
110 | Micronesia | 62 | 50 | 18 | 2.3 | Oceania |
# calculate the mean beer servings for each continent
drinks.groupby("continent")["beer_servings"].mean()
continent
Africa 61.471698
Asia 37.045455
Europe 193.777778
North America 145.434783
Oceania 89.687500
South America 175.083333
Name: beer_servings, dtype: float64
# other aggregation functions (such as 'max') can also be used with groupby
drinks.groupby("continent")["beer_servings"].max()
continent
Africa 376
Asia 247
Europe 361
North America 285
Oceania 306
South America 333
Name: beer_servings, dtype: int64
# multiple aggregation functions can be applied simultaneously
drinks.groupby("continent")["beer_servings"].agg(["count", "mean", "min", "max"])
count | mean | min | max | |
---|---|---|---|---|
continent | ||||
Africa | 53 | 61.471698 | 0 | 376 |
Asia | 44 | 37.045455 | 0 | 247 |
Europe | 45 | 193.777778 | 0 | 361 |
North America | 23 | 145.434783 | 1 | 285 |
Oceania | 16 | 89.687500 | 0 | 306 |
South America | 12 | 175.083333 | 93 | 333 |
Apply#
drinks["average_wine_servings"] = drinks.groupby("continent")[
"wine_servings"
].transform("mean")
drinks.head()
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | average_wine_servings | |
---|---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia | 9.068182 |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe | 142.222222 |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa | 16.264151 |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe | 142.222222 |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa | 16.264151 |
drinks["average_wine_servings"] = drinks["average_wine_servings"].apply(
lambda x: x / 100
)
drinks.head()
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | average_wine_servings | |
---|---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia | 0.090682 |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe | 1.422222 |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa | 0.162642 |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe | 1.422222 |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa | 0.162642 |
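For a simple arithmetic step like this, vectorized division is equivalent and avoids calling a Python lambda per element; a minimal sketch:
# equivalent vectorized form, computed directly from the group means
drinks.groupby("continent")["wine_servings"].transform("mean") / 100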
iris = pd.read_csv("data/iris.csv")
iris.head()
sepal_length | sepal_width | petal_length | petal_width | species | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
# count the number of rows in each group
iris.groupby("species").size()
species
setosa 50
versicolor 50
virginica 50
dtype: int64
# find the index label of each column's maximum value, per group
iris.groupby("species").idxmax()
sepal_length | sepal_width | petal_length | petal_width | |
---|---|---|---|---|
species | ||||
setosa | 14 | 15 | 24 | 43 |
versicolor | 50 | 85 | 83 | 70 |
virginica | 131 | 117 | 118 | 100 |
# named aggregation: each keyword defines an output column as (input_column, function)
iris.groupby("species").agg(
sepal_min=("sepal_length", "min"),
sepal_max=("sepal_length", "max"),
petal_mean=("petal_length", "mean"),
petal_std=("petal_length", "std"),
)
sepal_min | sepal_max | petal_mean | petal_std | |
---|---|---|---|---|
species | ||||
setosa | 4.3 | 5.8 | 1.462 | 0.173664 |
versicolor | 4.9 | 7.0 | 4.260 | 0.469911 |
virginica | 4.9 | 7.9 | 5.552 | 0.551895 |
Apply a function#
train = pd.read_csv("data/titanic_train.csv")
train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
# map 'female' to 0 and 'male' to 1
train["Sex_num"] = train.Sex.map({"female": 0, "male": 1})
train.loc[0:4, ["Sex", "Sex_num"]]
Sex | Sex_num | |
---|---|---|
0 | male | 1 |
1 | female | 0 |
2 | female | 0 |
3 | female | 0 |
4 | male | 1 |
# calculate the length of each string in the 'Name' Series
train["Name_length"] = train["Name"].apply(len)
train.loc[0:4, ["Name", "Name_length"]]
Name | Name_length | |
---|---|---|
0 | Braund, Mr. Owen Harris | 23 |
1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | 51 |
2 | Heikkinen, Miss. Laina | 22 |
3 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | 44 |
4 | Allen, Mr. William Henry | 24 |
# use a string method to split the 'Name' Series at commas (returns a Series of lists)
train["Name"].str.split(",").head()
0 [Braund, Mr. Owen Harris]
1 [Cumings, Mrs. John Bradley (Florence Briggs ...
2 [Heikkinen, Miss. Laina]
3 [Futrelle, Mrs. Jacques Heath (Lily May Peel)]
4 [Allen, Mr. William Henry]
Name: Name, dtype: object
# use a lambda function to extract the last name (the part before the comma)
train["Name"].str.split(",").apply(lambda x: x[0]).head()
0 Braund
1 Cumings
2 Heikkinen
3 Futrelle
4 Allen
Name: Name, dtype: object
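The same extraction works without a lambda, since the str accessor can index into each element's list; a minimal sketch:
# equivalent: select the first element of each list via the 'str' accessor
train["Name"].str.split(",").str[0].head()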
MultiIndex#
stocks = pd.read_csv("data/stocks.csv")
stocks
Date | Close | Volume | Symbol | |
---|---|---|---|---|
0 | 2016-10-03 | 31.50 | 14070500 | CSCO |
1 | 2016-10-03 | 112.52 | 21701800 | AAPL |
2 | 2016-10-03 | 57.42 | 19189500 | MSFT |
3 | 2016-10-04 | 113.00 | 29736800 | AAPL |
4 | 2016-10-04 | 57.24 | 20085900 | MSFT |
5 | 2016-10-04 | 31.35 | 18460400 | CSCO |
6 | 2016-10-05 | 57.64 | 16726400 | MSFT |
7 | 2016-10-05 | 31.59 | 11808600 | CSCO |
8 | 2016-10-05 | 113.05 | 21453100 | AAPL |
stocks.index
RangeIndex(start=0, stop=9, step=1)
# grouping by two columns produces a Series with a MultiIndex
ser = stocks.groupby(["Symbol", "Date"])["Close"].mean()
ser
Symbol Date
AAPL 2016-10-03 112.52
2016-10-04 113.00
2016-10-05 113.05
CSCO 2016-10-03 31.50
2016-10-04 31.35
2016-10-05 31.59
MSFT 2016-10-03 57.42
2016-10-04 57.24
2016-10-05 57.64
Name: Close, dtype: float64
ser.index
MultiIndex([('AAPL', '2016-10-03'),
('AAPL', '2016-10-04'),
('AAPL', '2016-10-05'),
('CSCO', '2016-10-03'),
('CSCO', '2016-10-04'),
('CSCO', '2016-10-05'),
('MSFT', '2016-10-03'),
('MSFT', '2016-10-04'),
('MSFT', '2016-10-05')],
names=['Symbol', 'Date'])
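With a two-level index, unstack pivots the inner level into columns directly; a minimal sketch that should match the pivot_table result below:
# move the inner index level ('Date') into columns
ser.unstack()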
# reshape the MultiIndexed Series into a Symbol-by-Date table
ser.to_frame().pivot_table(index="Symbol", columns="Date")
Close | |||
---|---|---|---|
Date | 2016-10-03 | 2016-10-04 | 2016-10-05 |
Symbol | |||
AAPL | 112.52 | 113.00 | 113.05 |
CSCO | 31.50 | 31.35 | 31.59 |
MSFT | 57.42 | 57.24 | 57.64 |
stocks = stocks.set_index(["Symbol", "Date"])
stocks
Close | Volume | ||
---|---|---|---|
Symbol | Date | ||
CSCO | 2016-10-03 | 31.50 | 14070500 |
AAPL | 2016-10-03 | 112.52 | 21701800 |
MSFT | 2016-10-03 | 57.42 | 19189500 |
AAPL | 2016-10-04 | 113.00 | 29736800 |
MSFT | 2016-10-04 | 57.24 | 20085900 |
CSCO | 2016-10-04 | 31.35 | 18460400 |
MSFT | 2016-10-05 | 57.64 | 16726400 |
CSCO | 2016-10-05 | 31.59 | 11808600 |
AAPL | 2016-10-05 | 113.05 | 21453100 |
stocks.index
MultiIndex([('CSCO', '2016-10-03'),
('AAPL', '2016-10-03'),
('MSFT', '2016-10-03'),
('AAPL', '2016-10-04'),
('MSFT', '2016-10-04'),
('CSCO', '2016-10-04'),
('MSFT', '2016-10-05'),
('CSCO', '2016-10-05'),
('AAPL', '2016-10-05')],
names=['Symbol', 'Date'])
stocks = stocks.sort_index()
stocks
Close | Volume | ||
---|---|---|---|
Symbol | Date | ||
AAPL | 2016-10-03 | 112.52 | 21701800 |
2016-10-04 | 113.00 | 29736800 | |
2016-10-05 | 113.05 | 21453100 | |
CSCO | 2016-10-03 | 31.50 | 14070500 |
2016-10-04 | 31.35 | 18460400 | |
2016-10-05 | 31.59 | 11808600 | |
MSFT | 2016-10-03 | 57.42 | 19189500 |
2016-10-04 | 57.24 | 20085900 | |
2016-10-05 | 57.64 | 16726400 |
# select a single row by passing a tuple with one value per index level
stocks.loc[("AAPL", "2016-10-03"), :]
Close 112.52
Volume 21701800.00
Name: (AAPL, 2016-10-03), dtype: float64
stocks.loc[("AAPL", "2016-10-03"), "Close"]
np.float64(112.52)
stocks.loc[["AAPL", "MSFT"], :]
Close | Volume | ||
---|---|---|---|
Symbol | Date | ||
AAPL | 2016-10-03 | 112.52 | 21701800 |
2016-10-04 | 113.00 | 29736800 | |
2016-10-05 | 113.05 | 21453100 | |
MSFT | 2016-10-03 | 57.42 | 19189500 |
2016-10-04 | 57.24 | 20085900 | |
2016-10-05 | 57.64 | 16726400 |
stocks.loc[(["AAPL", "MSFT"], "2016-10-03"), :]
Close | Volume | ||
---|---|---|---|
Symbol | Date | ||
AAPL | 2016-10-03 | 112.52 | 21701800 |
MSFT | 2016-10-03 | 57.42 | 19189500 |
stocks.loc[(["AAPL", "MSFT"], "2016-10-03"), "Close"]
Symbol Date
AAPL 2016-10-03 112.52
MSFT 2016-10-03 57.42
Name: Close, dtype: float64
# use slice(None) to select all values of the first ('Symbol') level
stocks.loc[(slice(None), ["2016-10-03", "2016-10-04"]), :]
Close | Volume | ||
---|---|---|---|
Symbol | Date | ||
AAPL | 2016-10-03 | 112.52 | 21701800 |
2016-10-04 | 113.00 | 29736800 | |
CSCO | 2016-10-03 | 31.50 | 14070500 |
2016-10-04 | 31.35 | 18460400 | |
MSFT | 2016-10-03 | 57.42 | 19189500 |
2016-10-04 | 57.24 | 20085900 |
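pd.IndexSlice expresses the same selection more readably than nested slice(None) tuples; a minimal sketch:
# equivalent selection with pd.IndexSlice (the index must be sorted)
idx = pd.IndexSlice
stocks.loc[idx[:, "2016-10-03":"2016-10-04"], :]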
Pivot#
stocks = stocks.pivot_table(values="Close", index="Symbol", columns="Date")
stocks
Date | 2016-10-03 | 2016-10-04 | 2016-10-05 |
---|---|---|---|
Symbol | |||
AAPL | 112.52 | 113.00 | 113.05 |
CSCO | 31.50 | 31.35 | 31.59 |
MSFT | 57.42 | 57.24 | 57.64 |
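stack is the inverse reshape, folding the Date columns back into a row index level; a minimal sketch:
# fold the 'Date' columns back into the index, recovering a MultiIndexed Series
stocks.stack()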
Clean#
Remove duplicate rows#
user_cols = ["user_id", "age", "gender", "occupation", "zip_code"]
users = pd.read_csv(
"data/movie.user", sep="|", header=None, names=user_cols, index_col="user_id"
)
users.head()
age | gender | occupation | zip_code | |
---|---|---|---|---|
user_id | ||||
1 | 24 | M | technician | 85711 |
2 | 53 | F | other | 94043 |
3 | 23 | M | writer | 32067 |
4 | 24 | M | technician | 43537 |
5 | 33 | F | other | 15213 |
users.shape
(943, 4)
# detect duplicate zip codes: True if an item is identical to a previous item
users["zip_code"].duplicated().tail()
user_id
939 False
940 True
941 False
942 False
943 False
Name: zip_code, dtype: bool
# count the duplicate items (True becomes 1, False becomes 0)
users["zip_code"].duplicated().sum()
np.int64(148)
# examine the duplicate rows (ignoring the first occurrence)
users.loc[users.duplicated(keep="first"), :]
age | gender | occupation | zip_code | |
---|---|---|---|---|
user_id | ||||
496 | 21 | F | student | 55414 |
572 | 51 | M | educator | 20003 |
621 | 17 | M | student | 60402 |
684 | 28 | M | student | 55414 |
733 | 44 | F | other | 60630 |
805 | 27 | F | other | 20009 |
890 | 32 | M | student | 97301 |
# examine the duplicate rows (ignoring the last occurrence)
users.loc[users.duplicated(keep="last"), :]
age | gender | occupation | zip_code | |
---|---|---|---|---|
user_id | ||||
67 | 17 | M | student | 60402 |
85 | 51 | M | educator | 20003 |
198 | 21 | F | student | 55414 |
350 | 32 | M | student | 97301 |
428 | 28 | M | student | 55414 |
437 | 27 | F | other | 20009 |
460 | 44 | F | other | 60630 |
# examine the duplicate rows (including all duplicates)
users.loc[users.duplicated(keep=False), :]
age | gender | occupation | zip_code | |
---|---|---|---|---|
user_id | ||||
67 | 17 | M | student | 60402 |
85 | 51 | M | educator | 20003 |
198 | 21 | F | student | 55414 |
350 | 32 | M | student | 97301 |
428 | 28 | M | student | 55414 |
437 | 27 | F | other | 20009 |
460 | 44 | F | other | 60630 |
496 | 21 | F | student | 55414 |
572 | 51 | M | educator | 20003 |
621 | 17 | M | student | 60402 |
684 | 28 | M | student | 55414 |
733 | 44 | F | other | 60630 |
805 | 27 | F | other | 20009 |
890 | 32 | M | student | 97301 |
# drop the duplicate rows (inplace=False by default)
users.drop_duplicates(keep="first").shape
(936, 4)
users.drop_duplicates(keep="last").shape
(936, 4)
users.drop_duplicates(keep=False).shape
(929, 4)
# only consider a subset of columns when identifying duplicates
users.duplicated(subset=["age", "zip_code"]).sum()
np.int64(16)
users.drop_duplicates(subset=["age", "zip_code"]).shape
(927, 4)
Detect missing values#
ufo = pd.read_csv("data/ufo.csv")
ufo.tail()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
18236 | Grant Park | NaN | TRIANGLE | IL | 12/31/2000 23:00 |
18237 | Spirit Lake | NaN | DISK | IA | 12/31/2000 23:00 |
18238 | Eagle River | NaN | NaN | WI | 12/31/2000 23:45 |
18239 | Eagle River | RED | LIGHT | WI | 12/31/2000 23:45 |
18240 | Ybor | NaN | OVAL | FL | 12/31/2000 23:59 |
ufo.isna().tail()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
18236 | False | True | False | False | False |
18237 | False | True | False | False | False |
18238 | False | True | True | False | False |
18239 | False | False | False | False | False |
18240 | False | True | False | False | False |
ufo.notna().tail()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
18236 | True | False | True | True | True |
18237 | True | False | True | True | True |
18238 | True | False | False | True | True |
18239 | True | True | True | True | True |
18240 | True | False | True | True | True |
# count the number of missing values in each Series
ufo.isna().sum()
City 26
Colors Reported 15359
Shape Reported 2644
State 0
Time 0
dtype: int64
# use the 'isna' Series method to filter the DataFrame rows
ufo.loc[ufo.City.isna()].head()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
21 | NaN | NaN | NaN | LA | 8/15/1943 0:00 |
22 | NaN | NaN | LIGHT | LA | 8/15/1943 0:00 |
204 | NaN | NaN | DISK | CA | 7/15/1952 12:30 |
241 | NaN | BLUE | DISK | MT | 7/4/1953 14:00 |
613 | NaN | NaN | DISK | NV | 7/1/1960 12:00 |
# examine the number of rows and columns
ufo.shape
(18241, 5)
# if 'any' values are missing in a row, then drop that row
ufo.dropna(how="any").shape
(2486, 5)
# 'inplace' parameter of 'dropna' is False by default, so the rows were only dropped in the returned copy
ufo.shape
(18241, 5)
# if 'all' values are missing in a row, then drop that row (none are dropped in this case)
ufo.dropna(how="all").shape
(18241, 5)
# if 'any' values are missing in a row (considering only 'City' and 'Shape Reported'), then drop that row
ufo.dropna(subset=["City", "Shape Reported"], how="any").shape
(15575, 5)
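A completeness threshold is another option: thresh keeps only rows with at least that many non-missing values; a minimal sketch:
# keep rows that have at least 4 non-missing values
ufo.dropna(thresh=4).shape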
# 'value_counts' does not include missing values by default
ufo["Shape Reported"].value_counts().head()
Shape Reported
LIGHT 2803
DISK 2122
TRIANGLE 1889
OTHER 1402
CIRCLE 1365
Name: count, dtype: int64
# explicitly include missing values
ufo["Shape Reported"].value_counts(dropna=False).head()
Shape Reported
LIGHT 2803
NaN 2644
DISK 2122
TRIANGLE 1889
OTHER 1402
Name: count, dtype: int64
Fill missing values#
# fill in missing values with a specified value
ufo["Shape Reported"] = ufo["Shape Reported"].fillna(value="VARIOUS")
# confirm that the missing values were filled in
ufo["Shape Reported"].value_counts().head()
Shape Reported
VARIOUS 2977
LIGHT 2803
DISK 2122
TRIANGLE 1889
OTHER 1402
Name: count, dtype: int64
# fill in missing values across the entire DataFrame with a single placeholder
ufo.fillna(value="UNKNOWN").head()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | Ithaca | UNKNOWN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | UNKNOWN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | UNKNOWN | OVAL | CO | 2/15/1931 14:00 |
3 | Abilene | UNKNOWN | DISK | KS | 6/1/1931 13:00 |
4 | New York Worlds Fair | UNKNOWN | LIGHT | NY | 4/18/1933 19:00 |
# fill missing values using the "backward fill" strategy (doesn't affect the DataFrame since inplace=False)
ufo.bfill().tail()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
18236 | Grant Park | RED | TRIANGLE | IL | 12/31/2000 23:00 |
18237 | Spirit Lake | RED | DISK | IA | 12/31/2000 23:00 |
18238 | Eagle River | RED | VARIOUS | WI | 12/31/2000 23:45 |
18239 | Eagle River | RED | LIGHT | WI | 12/31/2000 23:45 |
18240 | Ybor | NaN | OVAL | FL | 12/31/2000 23:59 |
# compare with the "forward fill" strategy (doesn't affect the DataFrame since inplace=False)
ufo.ffill().tail()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
18236 | Grant Park | RED | TRIANGLE | IL | 12/31/2000 23:00 |
18237 | Spirit Lake | RED | DISK | IA | 12/31/2000 23:00 |
18238 | Eagle River | RED | VARIOUS | WI | 12/31/2000 23:45 |
18239 | Eagle River | RED | LIGHT | WI | 12/31/2000 23:45 |
18240 | Ybor | RED | OVAL | FL | 12/31/2000 23:59 |
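fillna also accepts a dict mapping column names to fill values, so different columns can be filled differently in one call; a minimal sketch:
# per-column fill values
ufo.fillna({"City": "UNKNOWN", "Colors Reported": "NONE REPORTED"}).tail()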
Where & Mask#
train = pd.read_csv("data/titanic_train.csv")
train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
cond = train["Sex"] == "male"
# replace the values that fail the condition (the default replacement is NaN)
train["Sex"] = train["Sex"].where(cond, other="Female")
train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | Female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | Female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | Female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
train["quality"] = ""
cond1 = train["Sex"] == "male"
cond2 = train["Age"] > 25
# keep values only where the condition holds; replace the rest with 'low'
train["quality"] = train["quality"].where(cond1 & cond2, other="low")
train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | quality | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | low |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | Female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | low |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | Female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | low |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | Female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | low |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
train["quality"] = ""
cond1 = train["Sex"] == "male"
cond2 = train["Age"] > 25
# mask (replace) the values where the condition holds with 'low'
train["quality"] = train["quality"].mask(cond1 & cond2, other="low")
train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | quality | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | Female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | Female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | Female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S | low |
Outliers (clip)#
drinks["wine_servings"].clip(10, 300)
0 10
1 54
2 14
3 300
4 45
...
188 10
189 10
190 10
191 10
192 10
Name: wine_servings, Length: 193, dtype: int64
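The bounds are often derived from the data itself, e.g. winsorizing at the 5th and 95th percentiles; a minimal sketch:
# clip to the 5th and 95th percentiles instead of fixed bounds
lower, upper = drinks["wine_servings"].quantile([0.05, 0.95])
drinks["wine_servings"].clip(lower, upper)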
String & Time#
Change the data type#
drinks = pd.read_csv("data/drinks.csv")
drinks.head()
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa |
# examine the data type of each Series
drinks.dtypes
country object
beer_servings int64
spirit_servings int64
wine_servings int64
total_litres_of_pure_alcohol float64
continent object
dtype: object
# convert each column to the best possible nullable dtype
drinks.convert_dtypes().dtypes
country string[python]
beer_servings Int64
spirit_servings Int64
wine_servings Int64
total_litres_of_pure_alcohol Float64
continent string[python]
dtype: object
# change the data type of an existing Series
drinks["beer_servings"] = drinks["beer_servings"].astype(float)
drinks.dtypes
country object
beer_servings float64
spirit_servings int64
wine_servings int64
total_litres_of_pure_alcohol float64
continent object
dtype: object
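When a column may contain unparseable strings, pd.to_numeric with errors="coerce" converts what it can and turns the rest into NaN; a minimal sketch:
# coerce unparseable values to NaN instead of raising an error
pd.to_numeric(drinks["beer_servings"], errors="coerce").head()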
Use string methods#
orders = pd.read_csv("data/chipotle.tsv", sep="\t")
orders.head()
order_id | quantity | item_name | choice_description | item_price | |
---|---|---|---|---|---|
0 | 1 | 1 | Chips and Fresh Tomato Salsa | NaN | $2.39 |
1 | 1 | 1 | Izze | [Clementine] | $3.39 |
2 | 1 | 1 | Nantucket Nectar | [Apple] | $3.39 |
3 | 1 | 1 | Chips and Tomatillo-Green Chili Salsa | NaN | $2.39 |
4 | 2 | 2 | Chicken Bowl | [Tomatillo-Red Chili Salsa (Hot), [Black Beans... | $16.98 |
# string methods for pandas Series are accessed via 'str'
orders.item_name.str.upper().head()
0 CHIPS AND FRESH TOMATO SALSA
1 IZZE
2 NANTUCKET NECTAR
3 CHIPS AND TOMATILLO-GREEN CHILI SALSA
4 CHICKEN BOWL
Name: item_name, dtype: object
# use the boolean Series to filter the DataFrame
orders[orders.item_name.str.contains("Chicken")].head()
order_id | quantity | item_name | choice_description | item_price | |
---|---|---|---|---|---|
4 | 2 | 2 | Chicken Bowl | [Tomatillo-Red Chili Salsa (Hot), [Black Beans... | $16.98 |
5 | 3 | 1 | Chicken Bowl | [Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou... | $10.98 |
11 | 6 | 1 | Chicken Crispy Tacos | [Roasted Chili Corn Salsa, [Fajita Vegetables,... | $8.75 |
12 | 6 | 1 | Chicken Soft Tacos | [Roasted Chili Corn Salsa, [Rice, Black Beans,... | $8.75 |
13 | 7 | 1 | Chicken Bowl | [Fresh Tomato Salsa, [Fajita Vegetables, Rice,... | $11.25 |
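contains is case-sensitive by default; case=False broadens the match, and na= controls how missing values are treated; a minimal sketch:
# case-insensitive match
orders[orders.item_name.str.contains("chicken", case=False)].head()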
# string methods can be chained together
orders.choice_description.str.replace("[", "").str.replace("]", "").head()
0 NaN
1 Clementine
2 Apple
3 NaN
4 Tomatillo-Red Chili Salsa (Hot), Black Beans, ...
Name: choice_description, dtype: object
# many pandas string methods support regular expressions (pass regex=True; the default is regex=False in pandas 2.x)
orders.choice_description.str.replace(r"[\[\]]", "", regex=True).head()
0                                                  NaN
1                                           Clementine
2                                                Apple
3                                                  NaN
4    Tomatillo-Red Chili Salsa (Hot), Black Beans, ...
Name: choice_description, dtype: object
Dates & Times#
# read a dataset of UFO reports into a DataFrame
ufo = pd.read_csv("data/ufo.csv")
ufo.head()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
3 | Abilene | NaN | DISK | KS | 6/1/1931 13:00 |
4 | New York Worlds Fair | NaN | LIGHT | NY | 4/18/1933 19:00 |
# 'Time' is currently stored as a string
ufo.dtypes
City object
Colors Reported object
Shape Reported object
State object
Time object
dtype: object
# convert 'Time' to datetime format
ufo["Time"] = pd.to_datetime(ufo.Time)
ufo.head()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 1930-06-01 22:00:00 |
1 | Willingboro | NaN | OTHER | NJ | 1930-06-30 20:00:00 |
2 | Holyoke | NaN | OVAL | CO | 1931-02-15 14:00:00 |
3 | Abilene | NaN | DISK | KS | 1931-06-01 13:00:00 |
4 | New York Worlds Fair | NaN | LIGHT | NY | 1933-04-18 19:00:00 |
ufo.dtypes
City object
Colors Reported object
Shape Reported object
State object
Time datetime64[ns]
dtype: object
ufo = ufo.set_index("Time")
ufo.head()
City | Colors Reported | Shape Reported | State | |
---|---|---|---|---|
Time | ||||
1930-06-01 22:00:00 | Ithaca | NaN | TRIANGLE | NY |
1930-06-30 20:00:00 | Willingboro | NaN | OTHER | NJ |
1931-02-15 14:00:00 | Holyoke | NaN | OVAL | CO |
1931-06-01 13:00:00 | Abilene | NaN | DISK | KS |
1933-04-18 19:00:00 | New York Worlds Fair | NaN | LIGHT | NY |
# select the rows reported at exactly 3pm
ufo.at_time("15:00")
City | Colors Reported | Shape Reported | State | |
---|---|---|---|---|
Time | ||||
1943-06-01 15:00:00 | Nebraska | NaN | DISK | NE |
1947-07-15 15:00:00 | Clintwood | NaN | CYLINDER | VA |
1947-07-15 15:00:00 | Hazelton | NaN | DISK | ID |
1949-04-10 15:00:00 | Seattle | NaN | DISK | WA |
1949-06-30 15:00:00 | Knoxville | NaN | OTHER | TN |
... | ... | ... | ... | ... |
2000-10-12 15:00:00 | Mountain City | NaN | DISK | TN |
2000-10-28 15:00:00 | Kintnersville | ORANGE | OTHER | PA |
2000-11-20 15:00:00 | St. Joseph | NaN | EGG | MO |
2000-12-10 15:00:00 | Marshfield | NaN | LIGHT | WI |
2000-12-27 15:00:00 | Kea'au | NaN | SPHERE | HI |
254 rows × 4 columns
# select the rows reported between 9:45am and noon
ufo.between_time("09:45", "12:00")
City | Colors Reported | Shape Reported | State | |
---|---|---|---|---|
Time | ||||
1941-07-02 11:30:00 | Forest Home | NaN | CIRCLE | CA |
1943-10-15 11:00:00 | Owensboro | NaN | RECTANGLE | KY |
1944-01-01 10:00:00 | Wilderness | NaN | DISK | WV |
1944-01-01 12:00:00 | San Diego | NaN | CIGAR | CA |
1944-01-01 12:00:00 | Wilderness | NaN | DISK | WV |
... | ... | ... | ... | ... |
2000-12-27 10:00:00 | Santa Paula | NaN | LIGHT | CA |
2000-12-28 11:00:00 | Cerrilillo | NaN | NaN | NM |
2000-12-28 12:00:00 | Mansfield | NaN | FLASH | TX |
2000-12-30 10:00:00 | Simi Valley | NaN | FORMATION | CA |
2000-12-31 11:45:00 | Chicago | NaN | DISK | IL |
781 rows × 4 columns
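A DatetimeIndex also supports partial-string indexing, which makes date-range selection concise; a minimal sketch:
# select all reports from the year 2000
ufo.loc["2000"].head()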
Categorical data#
drinks = pd.read_csv("data/drinks.csv")
drinks.head()
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa |
drinks = drinks.convert_dtypes()
drinks.dtypes
country string[python]
beer_servings Int64
spirit_servings Int64
wine_servings Int64
total_litres_of_pure_alcohol Float64
continent string[python]
dtype: object
# calculate the memory usage for each Series (in bytes)
drinks.memory_usage(deep=True)
Index 132
country 12588
beer_servings 1737
spirit_servings 1737
wine_servings 1737
total_litres_of_pure_alcohol 1737
continent 12332
dtype: int64
# use the 'category' data type to store the 'continent' strings as integers
drinks["continent"] = drinks["continent"].astype("category")
drinks.dtypes
country string[python]
beer_servings Int64
spirit_servings Int64
wine_servings Int64
total_litres_of_pure_alcohol Float64
continent category
dtype: object
# strings are now encoded (0 means 'Africa', 1 means 'Asia', 2 means 'Europe', etc.)
drinks["continent"].cat.codes.head()
0 1
1 2
2 0
3 2
4 0
dtype: int8
# memory usage has been drastically reduced
drinks.memory_usage(deep=True)
Index 132
country 12588
beer_servings 1737
spirit_servings 1737
wine_servings 1737
total_litres_of_pure_alcohol 1737
continent 756
dtype: int64
# repeat this process for the 'country' Series
drinks["country"] = drinks["country"].astype("category")
drinks.memory_usage(deep=True)
Index 132
country 17142
beer_servings 1737
spirit_servings 1737
wine_servings 1737
total_litres_of_pure_alcohol 1737
continent 756
dtype: int64
# memory usage increased because we created 193 categories
drinks["country"].cat.categories
Index(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
'Antigua & Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria',
...
'United Arab Emirates', 'United Kingdom', 'Uruguay', 'Uzbekistan',
'Vanuatu', 'Venezuela', 'Vietnam', 'Yemen', 'Zambia', 'Zimbabwe'],
dtype='string', length=193)
Dummy variables#
train = pd.read_csv("data/titanic_train.csv")
train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
# use 'get_dummies' to create one column for every possible value
pd.get_dummies(train.Sex).head()
female | male | |
---|---|---|
0 | False | True |
1 | True | False |
2 | True | False |
3 | True | False |
4 | False | True |
# drop the first dummy variable ('female') using the 'iloc' method
# add a prefix to identify the source of the dummy variables
pd.get_dummies(train.Sex, prefix="Sex").iloc[:, 1:].head()
Sex_male | |
---|---|
0 | True |
1 | False |
2 | False |
3 | False |
4 | True |
# use 'get_dummies' with a feature that has 3 possible values
pd.get_dummies(train.Embarked, prefix="Embarked").head(10)
Embarked_C | Embarked_Q | Embarked_S | |
---|---|---|---|
0 | False | False | True |
1 | True | False | False |
2 | False | False | True |
3 | False | False | True |
4 | False | False | True |
5 | False | True | False |
6 | False | False | True |
7 | False | False | True |
8 | False | False | True |
9 | True | False | False |
# drop the first dummy variable ('C')
pd.get_dummies(train.Embarked, prefix="Embarked").iloc[:, 1:].head(10)
Embarked_Q | Embarked_S | |
---|---|---|
0 | False | True |
1 | False | False |
2 | False | True |
3 | False | True |
4 | False | True |
5 | True | False |
6 | False | True |
7 | False | True |
8 | False | True |
9 | False | False |
# save the DataFrame of dummy variables and concatenate them to the original DataFrame
embarked_dummies = pd.get_dummies(train.Embarked, prefix="Embarked").iloc[:, 1:]
train = pd.concat([train, embarked_dummies], axis=1)
train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Embarked_Q | Embarked_S | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | False | True |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | False | False |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | False | True |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | False | True |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S | False | True |
train = pd.read_csv("data/titanic_train.csv")
train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
# pass the DataFrame to 'get_dummies' and specify which columns to dummy (it drops the original columns)
pd.get_dummies(train, columns=["Sex", "Embarked"]).head()
PassengerId | Survived | Pclass | Name | Age | SibSp | Parch | Ticket | Fare | Cabin | Sex_female | Sex_male | Embarked_C | Embarked_Q | Embarked_S | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | False | True | False | False | True |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | True | False | True | False | False |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | True | False | False | False | True |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | True | False | False | False | True |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | False | True | False | False | True |
# use the 'drop_first' parameter to drop the first dummy variable for each feature
pd.get_dummies(train, columns=["Sex", "Embarked"], drop_first=True).head()
PassengerId | Survived | Pclass | Name | Age | SibSp | Parch | Ticket | Fare | Cabin | Sex_male | Embarked_Q | Embarked_S | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | True | False | True |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | False | False | False |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | False | False | True |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | False | False | True |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | True | False | True |
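Since pandas 1.5, from_dummies can invert the encoding; a minimal sketch using the 'Sex' dummies (which have no missing values):
# round-trip the dummy columns back to a single column of labels
sex_dummies = pd.get_dummies(train["Sex"], prefix="Sex")
pd.from_dummies(sex_dummies, sep="_").head()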
Ordinal data#
# create a small DataFrame from a dictionary
small = pd.DataFrame(
{"ID": [100, 101, 102, 103], "quality": ["good", "very good", "good", "excellent"]}
)
small.head()
ID | quality | |
---|---|---|
0 | 100 | good |
1 | 101 | very good |
2 | 102 | good |
3 | 103 | excellent |
# sort the DataFrame by the 'quality' Series (alphabetical order)
small.sort_values("quality")
ID | quality | |
---|---|---|
3 | 103 | excellent |
0 | 100 | good |
2 | 102 | good |
1 | 101 | very good |
from pandas.api.types import CategoricalDtype
# define a logical ordering for the categories
cats = ["good", "very good", "excellent"]
cat_type = CategoricalDtype(categories=cats, ordered=True)
small["quality"] = small.quality.astype(cat_type)
small.quality
0 good
1 very good
2 good
3 excellent
Name: quality, dtype: category
Categories (3, object): ['good' < 'very good' < 'excellent']
# sort the DataFrame by the 'quality' Series (logical order)
small.sort_values("quality")
ID | quality | |
---|---|---|
0 | 100 | good |
2 | 102 | good |
1 | 101 | very good |
3 | 103 | excellent |
# comparison operators work with ordered categories
small.loc[small.quality > "good", :]
ID | quality | |
---|---|---|
1 | 101 | very good |
3 | 103 | excellent |
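The logical ordering also determines the integer codes, which is convenient for modeling; a minimal sketch:
# codes follow the defined order: good=0, very good=1, excellent=2
small["quality"].cat.codes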
Sample rows#
# use the 'random_state' parameter for reproducibility
ufo.sample(n=3, random_state=42)
City | Colors Reported | Shape Reported | State | |
---|---|---|---|---|
Time | ||||
1952-09-15 14:00:00 | Norridgewock | NaN | DISK | ME |
1998-10-01 21:15:00 | Ipava | NaN | TRIANGLE | IL |
2000-11-13 22:00:00 | Ellinwood | NaN | FIREBALL | KS |
# sample 75% of the DataFrame's rows without replacement
train = ufo.sample(frac=0.75, random_state=99)
# store the remaining 25% of the rows in another DataFrame
test = ufo.loc[~ufo.index.isin(train.index), :]
test.head()
City | Colors Reported | Shape Reported | State | |
---|---|---|---|---|
Time | ||||
1933-04-18 19:00:00 | New York Worlds Fair | NaN | LIGHT | NY |
1934-09-15 15:30:00 | Valley City | NaN | DISK | ND |
1936-10-15 17:00:00 | Eklutna | NaN | CIGAR | AK |
1939-06-01 20:00:00 | Waterloo | NaN | FIREBALL | AL |
1939-07-07 02:00:00 | Keokuk | NaN | OVAL | IA |