Cleaning and Preprocessing#
import polars as pl
Explore#
drinks = pl.read_csv("data/drinks.csv")
drinks.head()
shape: (5, 6)
| country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent |
|---|---|---|---|---|---|
| str | i64 | i64 | i64 | f64 | str |
| "Afghanistan" | 0 | 0 | 0 | 0.0 | "Asia" |
| "Albania" | 89 | 132 | 54 | 4.9 | "Europe" |
| "Algeria" | 25 | 0 | 14 | 0.7 | "Africa" |
| "Andorra" | 245 | 138 | 312 | 12.4 | "Europe" |
| "Angola" | 217 | 57 | 45 | 5.9 | "Africa" |
Max & Min#
drinks.top_k(10, by="beer_servings")
shape: (10, 6)
| country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent |
|---|---|---|---|---|---|
| str | i64 | i64 | i64 | f64 | str |
| "Namibia" | 376 | 3 | 1 | 6.8 | "Africa" |
| "Czech Republic" | 361 | 170 | 134 | 11.8 | "Europe" |
| "Gabon" | 347 | 98 | 59 | 8.9 | "Africa" |
| "Germany" | 346 | 117 | 175 | 11.3 | "Europe" |
| "Poland" | 343 | 215 | 56 | 10.9 | "Europe" |
| "Lithuania" | 343 | 244 | 56 | 12.9 | "Europe" |
| "Venezuela" | 333 | 100 | 3 | 7.7 | "South America" |
| "Ireland" | 313 | 118 | 165 | 11.4 | "Europe" |
| "Palau" | 306 | 63 | 23 | 6.9 | "Oceania" |
| "Romania" | 297 | 122 | 167 | 10.4 | "Europe" |
drinks["beer_servings"].top_k(10)
shape: (10,)
| beer_servings |
|---|
| i64 |
| 343 |
| 361 |
| 347 |
| 346 |
| 343 |
| 376 |
| 333 |
| 313 |
| 306 |
| 297 |
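# note that top_k does not guarantee sorted output, as seen above; chaining a
# sort gives a ranked Series (a sketch)
drinks["beer_servings"].top_k(10).sort(descending=True)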
Count#
drinks["continent"].value_counts()
shape: (6, 2)
| continent | count |
|---|---|
| str | u32 |
| "Europe" | 45 |
| "Asia" | 44 |
| "Africa" | 53 |
| "Oceania" | 16 |
| "South America" | 12 |
| "North America" | 23 |
drinks["continent"].value_counts(normalize=True)
shape: (6, 2)
| continent | proportion |
|---|---|
| str | f64 |
| "Europe" | 0.233161 |
| "South America" | 0.062176 |
| "North America" | 0.119171 |
| "Asia" | 0.227979 |
| "Oceania" | 0.082902 |
| "Africa" | 0.274611 |
drinks["continent"].value_counts().columns
['continent', 'count']
# value_counts returns a DataFrame, which can be sorted by any of its columns
drinks["continent"].value_counts().sort(by="count")
shape: (6, 2)
| continent | count |
|---|---|
| str | u32 |
| "South America" | 12 |
| "Oceania" | 16 |
| "North America" | 23 |
| "Asia" | 44 |
| "Europe" | 45 |
| "Africa" | 53 |
drinks["continent"].unique()
shape: (6,)
| continent |
|---|
| str |
| "Asia" |
| "North America" |
| "Africa" |
| "Oceania" |
| "South America" |
| "Europe" |
drinks["continent"].unique_counts()
shape: (6,)
| continent |
|---|
| u32 |
| 44 |
| 45 |
| 53 |
| 23 |
| 12 |
| 16 |
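# unique_counts returns bare counts in order of first appearance; pairing it
# with unique(maintain_order=True) keeps the labels alongside (a sketch,
# assuming both preserve the same appearance order)
pl.DataFrame({
    "continent": drinks["continent"].unique(maintain_order=True),
    "count": drinks["continent"].unique_counts(),
})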
Sort#
movies = pl.read_csv("data/imdb_1000.csv")
movies.head()
shape: (5, 6)
| star_rating | title | content_rating | genre | duration | actors_list |
|---|---|---|---|---|---|
| f64 | str | str | str | i64 | str |
| 9.3 | "The Shawshank Redemption" | "R" | "Crime" | 142 | "[u'Tim Robbins', u'Morgan Free… |
| 9.2 | "The Godfather" | "R" | "Crime" | 175 | "[u'Marlon Brando', u'Al Pacino… |
| 9.1 | "The Godfather: Part II" | "R" | "Crime" | 200 | "[u'Al Pacino', u'Robert De Nir… |
| 9.0 | "The Dark Knight" | "PG-13" | "Action" | 152 | "[u'Christian Bale', u'Heath Le… |
| 8.9 | "Pulp Fiction" | "R" | "Crime" | 154 | "[u'John Travolta', u'Uma Thurm… |
# sort the 'title' Series in ascending order (returns a Series)
movies["title"].sort().head()
shape: (10,)
| title |
|---|
| str |
| "(500) Days of Summer" |
| "12 Angry Men" |
| "12 Years a Slave" |
| "127 Hours" |
| "2001: A Space Odyssey" |
| "2046" |
| "21 Grams" |
| "25th Hour" |
| "28 Days Later..." |
| "3 Idiots" |
# sort in descending order instead
movies["title"].sort(descending=True).head()
shape: (10,)
| title |
|---|
| str |
| "[Rec]" |
| "Zulu" |
| "Zombieland" |
| "Zodiac" |
| "Zero Dark Thirty" |
| "Zelig" |
| "Young Frankenstein" |
| "Yojimbo" |
| "Yip Man" |
| "Ying xiong" |
# sort the entire DataFrame by the 'title' Series (returns a DataFrame)
movies.sort(by="title").head()
shape: (5, 6)
| star_rating | title | content_rating | genre | duration | actors_list |
|---|---|---|---|---|---|
| f64 | str | str | str | i64 | str |
| 7.8 | "(500) Days of Summer" | "PG-13" | "Comedy" | 95 | "[u'Zooey Deschanel', u'Joseph … |
| 8.9 | "12 Angry Men" | "NOT RATED" | "Drama" | 96 | "[u'Henry Fonda', u'Lee J. Cobb… |
| 8.1 | "12 Years a Slave" | "R" | "Biography" | 134 | "[u'Chiwetel Ejiofor', u'Michae… |
| 7.6 | "127 Hours" | "R" | "Adventure" | 94 | "[u'James Franco', u'Amber Tamb… |
| 8.3 | "2001: A Space Odyssey" | "G" | "Mystery" | 160 | "[u'Keir Dullea', u'Gary Lockwo… |
# passing descending=False explicitly gives the same ascending order (use descending=True to reverse)
movies.sort(by="title", descending=False).head()
shape: (5, 6)
| star_rating | title | content_rating | genre | duration | actors_list |
|---|---|---|---|---|---|
| f64 | str | str | str | i64 | str |
| 7.8 | "(500) Days of Summer" | "PG-13" | "Comedy" | 95 | "[u'Zooey Deschanel', u'Joseph … |
| 8.9 | "12 Angry Men" | "NOT RATED" | "Drama" | 96 | "[u'Henry Fonda', u'Lee J. Cobb… |
| 8.1 | "12 Years a Slave" | "R" | "Biography" | 134 | "[u'Chiwetel Ejiofor', u'Michae… |
| 7.6 | "127 Hours" | "R" | "Adventure" | 94 | "[u'James Franco', u'Amber Tamb… |
| 8.3 | "2001: A Space Odyssey" | "G" | "Mystery" | 160 | "[u'Keir Dullea', u'Gary Lockwo… |
# sort the DataFrame first by 'content_rating', then by 'duration'
movies.sort(by=["content_rating", "duration"]).head()
shape: (5, 6)
| star_rating | title | content_rating | genre | duration | actors_list |
|---|---|---|---|---|---|
| f64 | str | str | str | i64 | str |
| 8.2 | "Butch Cassidy and the Sundance… | null | "Biography" | 110 | "[u'Paul Newman', u'Robert Redf… |
| 7.4 | "True Grit" | null | "Adventure" | 128 | "[u'John Wayne', u'Kim Darby', … |
| 7.7 | "Where Eagles Dare" | null | "Action" | 158 | "[u'Richard Burton', u'Clint Ea… |
| 7.6 | "The Jungle Book" | "APPROVED" | "Animation" | 78 | "[u'Phil Harris', u'Sebastian C… |
| 7.8 | "Invasion of the Body Snatchers" | "APPROVED" | "Horror" | 80 | "[u'Kevin McCarthy', u'Dana Wyn… |
Transform#
Group#
drinks = pl.read_csv("data/drinks.csv")
drinks.head()
shape: (5, 6)
| country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent |
|---|---|---|---|---|---|
| str | i64 | i64 | i64 | f64 | str |
| "Afghanistan" | 0 | 0 | 0 | 0.0 | "Asia" |
| "Albania" | 89 | 132 | 54 | 4.9 | "Europe" |
| "Algeria" | 25 | 0 | 14 | 0.7 | "Africa" |
| "Andorra" | 245 | 138 | 312 | 12.4 | "Europe" |
| "Angola" | 217 | 57 | 45 | 5.9 | "Africa" |
# calculate the mean beer servings for each continent
drinks.group_by("continent").mean()["beer_servings"]
shape: (6,)
| beer_servings |
|---|
| f64 |
| 61.471698 |
| 89.6875 |
| 175.083333 |
| 193.777778 |
| 145.434783 |
| 37.045455 |
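# the aggregated Series above carries no continent labels; aggregating inside
# 'agg' keeps them (a sketch)
drinks.group_by("continent").agg(pl.col("beer_servings").mean())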
# other aggregation functions (such as 'max') can also be used with group_by
drinks.group_by("continent").max()["beer_servings"]
shape: (6,)
| beer_servings |
|---|
| i64 |
| 306 |
| 361 |
| 285 |
| 247 |
| 333 |
| 376 |
# multiple aggregation functions can be applied simultaneously
drinks.group_by("continent").agg(
pl.col("beer_servings").count().name.suffix("_count"),
pl.col("beer_servings").mean().name.suffix("_mean"),
pl.col("beer_servings").min().name.suffix("_min"),
pl.col("beer_servings").max().name.suffix("_max"),
)
shape: (6, 5)
| continent | beer_servings_count | beer_servings_mean | beer_servings_min | beer_servings_max |
|---|---|---|---|---|
| str | u32 | f64 | i64 | i64 |
| "South America" | 12 | 175.083333 | 93 | 333 |
| "Asia" | 44 | 37.045455 | 0 | 247 |
| "Oceania" | 16 | 89.6875 | 0 | 306 |
| "Africa" | 53 | 61.471698 | 0 | 376 |
| "North America" | 23 | 145.434783 | 1 | 285 |
| "Europe" | 45 | 193.777778 | 0 | 361 |
Apply#
drinks.group_by("continent").mean()
shape: (6, 6)
| continent | country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol |
|---|---|---|---|---|---|
| str | str | f64 | f64 | f64 | f64 |
| "Asia" | null | 37.045455 | 60.840909 | 9.068182 | 2.170455 |
| "Europe" | null | 193.777778 | 132.555556 | 142.222222 | 8.617778 |
| "Africa" | null | 61.471698 | 16.339623 | 16.264151 | 3.007547 |
| "South America" | null | 175.083333 | 114.75 | 62.416667 | 6.308333 |
| "Oceania" | null | 89.6875 | 58.4375 | 35.625 | 3.38125 |
| "North America" | null | 145.434783 | 165.73913 | 24.521739 | 5.995652 |
iris = pl.read_csv("data/iris.csv")
iris.head()
shape: (5, 5)
| sepal_length | sepal_width | petal_length | petal_width | species |
|---|---|---|---|---|
| f64 | f64 | f64 | f64 | str |
| 5.1 | 3.5 | 1.4 | 0.2 | "setosa" |
| 4.9 | 3.0 | 1.4 | 0.2 | "setosa" |
| 4.7 | 3.2 | 1.3 | 0.2 | "setosa" |
| 4.6 | 3.1 | 1.5 | 0.2 | "setosa" |
| 5.0 | 3.6 | 1.4 | 0.2 | "setosa" |
iris.group_by("species").len()
shape: (3, 2)
| species | len |
|---|---|
| str | u32 |
| "versicolor" | 50 |
| "virginica" | 50 |
| "setosa" | 50 |
iris.group_by("species").agg([
pl.min("sepal_length").name.suffix("_min"),
pl.max("sepal_length").name.suffix("_max"),
pl.mean("petal_length").name.suffix("_mean"),
pl.std("petal_length").name.suffix("_std"),
])
shape: (3, 5)
| species | sepal_length_min | sepal_length_max | petal_length_mean | petal_length_std |
|---|---|---|---|---|
| str | f64 | f64 | f64 | f64 |
| "virginica" | 4.9 | 7.9 | 5.552 | 0.551895 |
| "versicolor" | 4.9 | 7.0 | 4.26 | 0.469911 |
| "setosa" | 4.3 | 5.8 | 1.462 | 0.173664 |
Apply a function#
train = pl.read_csv("data/titanic_train.csv")
train.head()
shape: (5, 12)
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|
| i64 | i64 | i64 | str | str | f64 | i64 | i64 | str | f64 | str | str |
| 1 | 0 | 3 | "Braund, Mr. Owen Harris" | "male" | 22.0 | 1 | 0 | "A/5 21171" | 7.25 | null | "S" |
| 2 | 1 | 1 | "Cumings, Mrs. John Bradley (Fl… | "female" | 38.0 | 1 | 0 | "PC 17599" | 71.2833 | "C85" | "C" |
| 3 | 1 | 3 | "Heikkinen, Miss. Laina" | "female" | 26.0 | 0 | 0 | "STON/O2. 3101282" | 7.925 | null | "S" |
| 4 | 1 | 1 | "Futrelle, Mrs. Jacques Heath (… | "female" | 35.0 | 1 | 0 | "113803" | 53.1 | "C123" | "S" |
| 5 | 0 | 3 | "Allen, Mr. William Henry" | "male" | 35.0 | 0 | 0 | "373450" | 8.05 | null | "S" |
# map 'female' to '0' and 'male' to '1' (note that 'replace' preserves the String dtype)
train = train.with_columns(Sex_num=pl.col("Sex").replace({"female": 0, "male": 1}))
train[0:4, ["Sex", "Sex_num"]]
shape: (4, 2)
| Sex | Sex_num |
|---|---|
| str | str |
| "male" | "1" |
| "female" | "0" |
| "female" | "0" |
| "female" | "0" |
# calculate the length of each string in the 'Name' Series
# (note: pl.col("Name").len() would return the column length, not per-string lengths)
train = train.with_columns(Name_length=pl.col("Name").str.len_chars())
train[0:4, ["Name", "Name_length"]]
shape: (4, 2)
| Name | Name_length |
|---|---|
| str | u32 |
| "Braund, Mr. Owen Harris" | 23 |
| "Cumings, Mrs. John Bradley (Fl… | 51 |
| "Heikkinen, Miss. Laina" | 22 |
| "Futrelle, Mrs. Jacques Heath (… | 44 |
# use a string method to split the 'Name' Series at commas (returns a Series of lists)
train["Name"].str.split(",").head()
shape: (10,)
| Name |
|---|
| list[str] |
| ["Braund", " Mr. Owen Harris"] |
| ["Cumings", " Mrs. John Bradley (Florence Briggs Thayer)"] |
| ["Heikkinen", " Miss. Laina"] |
| ["Futrelle", " Mrs. Jacques Heath (Lily May Peel)"] |
| ["Allen", " Mr. William Henry"] |
| ["Moran", " Mr. James"] |
| ["McCarthy", " Mr. Timothy J"] |
| ["Palsson", " Master. Gosta Leonard"] |
| ["Johnson", " Mrs. Oscar W (Elisabeth Vilhelmina Berg)"] |
| ["Nasser", " Mrs. Nicholas (Adele Achem)"] |
# use a lambda function with 'map_elements' to extract the first element
train["Name"].str.split(",").map_elements(lambda x: x[0], return_dtype=pl.String).head()
shape: (10,)
| Name |
|---|
| str |
| "Braund" |
| "Cumings" |
| "Heikkinen" |
| "Futrelle" |
| "Allen" |
| "Moran" |
| "McCarthy" |
| "Palsson" |
| "Johnson" |
| "Nasser" |
Pivot#
stocks = pl.read_csv("data/stocks.csv")
stocks
shape: (9, 4)
| Date | Close | Volume | Symbol |
|---|---|---|---|
| str | f64 | i64 | str |
| "2016-10-03" | 31.5 | 14070500 | "CSCO" |
| "2016-10-03" | 112.52 | 21701800 | "AAPL" |
| "2016-10-03" | 57.42 | 19189500 | "MSFT" |
| "2016-10-04" | 113.0 | 29736800 | "AAPL" |
| "2016-10-04" | 57.24 | 20085900 | "MSFT" |
| "2016-10-04" | 31.35 | 18460400 | "CSCO" |
| "2016-10-05" | 57.64 | 16726400 | "MSFT" |
| "2016-10-05" | 31.59 | 11808600 | "CSCO" |
| "2016-10-05" | 113.05 | 21453100 | "AAPL" |
stocks = stocks.pivot(values="Close", index="Symbol", on="Date")
stocks
shape: (3, 4)
| Symbol | 2016-10-03 | 2016-10-04 | 2016-10-05 |
|---|---|---|---|
| str | f64 | f64 | f64 |
| "CSCO" | 31.5 | 31.35 | 31.59 |
| "AAPL" | 112.52 | 113.0 | 113.05 |
| "MSFT" | 57.42 | 57.24 | 57.64 |
Clean#
Remove duplicate rows#
user_cols = ["user_id", "age", "gender", "occupation", "zip_code"]
users = pl.read_csv("data/movie.user", separator="|", has_header=False)
users.columns = user_cols
users.head()
shape: (5, 5)
| user_id | age | gender | occupation | zip_code |
|---|---|---|---|---|
| i64 | i64 | str | str | str |
| 1 | 24 | "M" | "technician" | "85711" |
| 2 | 53 | "F" | "other" | "94043" |
| 3 | 23 | "M" | "writer" | "32067" |
| 4 | 24 | "M" | "technician" | "43537" |
| 5 | 33 | "F" | "other" | "15213" |
users.shape
(943, 5)
# count the duplicate items (True becomes 1, False becomes 0)
users["zip_code"].is_duplicated().sum()
250
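# to actually drop duplicates, 'unique' works on whole rows or a subset of
# columns; 'keep' controls which duplicate survives (a sketch)
users.unique(subset=["zip_code"], keep="first").shape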
Detect missing values#
ufo = pl.read_csv("data/ufo.csv")
ufo.tail()
shape: (5, 5)
| City | Colors Reported | Shape Reported | State | Time |
|---|---|---|---|---|
| str | str | str | str | str |
| "Grant Park" | null | "TRIANGLE" | "IL" | "12/31/2000 23:00" |
| "Spirit Lake" | null | "DISK" | "IA" | "12/31/2000 23:00" |
| "Eagle River" | null | null | "WI" | "12/31/2000 23:45" |
| "Eagle River" | "RED" | "LIGHT" | "WI" | "12/31/2000 23:45" |
| "Ybor" | null | "OVAL" | "FL" | "12/31/2000 23:59" |
ufo.select(pl.all().is_null())
shape: (18_241, 5)
| City | Colors Reported | Shape Reported | State | Time |
|---|---|---|---|---|
| bool | bool | bool | bool | bool |
| false | true | false | false | false |
| false | true | false | false | false |
| false | true | false | false | false |
| false | true | false | false | false |
| false | true | false | false | false |
| … | … | … | … | … |
| false | true | false | false | false |
| false | true | false | false | false |
| false | true | true | false | false |
| false | false | false | false | false |
| false | true | false | false | false |
ufo.select(pl.all().is_not_null())
shape: (18_241, 5)
| City | Colors Reported | Shape Reported | State | Time |
|---|---|---|---|---|
| bool | bool | bool | bool | bool |
| true | false | true | true | true |
| true | false | true | true | true |
| true | false | true | true | true |
| true | false | true | true | true |
| true | false | true | true | true |
| … | … | … | … | … |
| true | false | true | true | true |
| true | false | true | true | true |
| true | false | false | true | true |
| true | true | true | true | true |
| true | false | true | true | true |
# count the number of missing values in each Series
ufo.null_count()
shape: (1, 5)
| City | Colors Reported | Shape Reported | State | Time |
|---|---|---|---|---|
| u32 | u32 | u32 | u32 | u32 |
| 25 | 15359 | 2644 | 0 | 0 |
# if 'any' values are missing in a row, then drop that row
ufo.drop_nulls().shape
(2486, 5)
# if 'all' values are missing in a row, then drop that row (none are dropped in this case)
ufo.filter(~pl.all_horizontal(pl.all().is_null())).shape
(18241, 5)
# if 'all' values are missing in a row (considering only 'City' and 'Shape Reported'), then drop that row
ufo.filter(~pl.all_horizontal(pl.col(["City", "Shape Reported"]).is_null())).shape
(18237, 5)
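# the 'any' version of the same subset check: 'drop_nulls' with 'subset' drops
# rows where any of the listed columns is null (a sketch)
ufo.drop_nulls(subset=["City", "Shape Reported"]).shape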
# unlike pandas, 'value_counts' includes missing values (null is counted as its own group)
ufo["Shape Reported"].value_counts()
shape: (28, 2)
| Shape Reported | count |
|---|---|
| str | u32 |
| "TEARDROP" | 119 |
| "DISK" | 2122 |
| "ROUND" | 2 |
| "TRIANGLE" | 1889 |
| "CYLINDER" | 294 |
| … | … |
| "CROSS" | 36 |
| "LIGHT" | 2803 |
| "DIAMOND" | 234 |
| "CRESCENT" | 2 |
| "SPHERE" | 1054 |
Fill missing values#
# fill in missing values with a specified value
ufo.with_columns([
pl.col("Shape Reported").fill_null(value="VARIOUS"),
pl.col("Colors Reported").fill_null(value="UNKNOWN"),
])
shape: (18_241, 5)
| City | Colors Reported | Shape Reported | State | Time |
|---|---|---|---|---|
| str | str | str | str | str |
| "Ithaca" | "UNKNOWN" | "TRIANGLE" | "NY" | "6/1/1930 22:00" |
| "Willingboro" | "UNKNOWN" | "OTHER" | "NJ" | "6/30/1930 20:00" |
| "Holyoke" | "UNKNOWN" | "OVAL" | "CO" | "2/15/1931 14:00" |
| "Abilene" | "UNKNOWN" | "DISK" | "KS" | "6/1/1931 13:00" |
| "New York Worlds Fair" | "UNKNOWN" | "LIGHT" | "NY" | "4/18/1933 19:00" |
| … | … | … | … | … |
| "Grant Park" | "UNKNOWN" | "TRIANGLE" | "IL" | "12/31/2000 23:00" |
| "Spirit Lake" | "UNKNOWN" | "DISK" | "IA" | "12/31/2000 23:00" |
| "Eagle River" | "UNKNOWN" | "VARIOUS" | "WI" | "12/31/2000 23:45" |
| "Eagle River" | "RED" | "LIGHT" | "WI" | "12/31/2000 23:45" |
| "Ybor" | "UNKNOWN" | "OVAL" | "FL" | "12/31/2000 23:59" |
# the filled DataFrame was not assigned back to 'ufo', so the counts are unchanged
ufo["Shape Reported"].value_counts().head()
shape: (5, 2)
| Shape Reported | count |
|---|---|
| str | u32 |
| "DIAMOND" | 234 |
| "TRIANGLE" | 1889 |
| "TEARDROP" | 119 |
| "LIGHT" | 2803 |
| "DELTA" | 7 |
# fill missing values using the "forward" strategy ('ufo' itself is unchanged since the result is not assigned)
ufo.fill_null(strategy="forward").tail()
shape: (5, 5)
| City | Colors Reported | Shape Reported | State | Time |
|---|---|---|---|---|
| str | str | str | str | str |
| "Grant Park" | "RED" | "TRIANGLE" | "IL" | "12/31/2000 23:00" |
| "Spirit Lake" | "RED" | "DISK" | "IA" | "12/31/2000 23:00" |
| "Eagle River" | "RED" | "DISK" | "WI" | "12/31/2000 23:45" |
| "Eagle River" | "RED" | "LIGHT" | "WI" | "12/31/2000 23:45" |
| "Ybor" | "RED" | "OVAL" | "FL" | "12/31/2000 23:59" |
# compare with "forward fill" strategy (doesn't affect the DataFrame since inplace=False)
ufo.fill_null(strategy="backward").tail()
shape: (5, 5)
| City | Colors Reported | Shape Reported | State | Time |
|---|---|---|---|---|
| str | str | str | str | str |
| "Grant Park" | "RED" | "TRIANGLE" | "IL" | "12/31/2000 23:00" |
| "Spirit Lake" | "RED" | "DISK" | "IA" | "12/31/2000 23:00" |
| "Eagle River" | "RED" | "LIGHT" | "WI" | "12/31/2000 23:45" |
| "Eagle River" | "RED" | "LIGHT" | "WI" | "12/31/2000 23:45" |
| "Ybor" | null | "OVAL" | "FL" | "12/31/2000 23:59" |
Outliers (clip)#
drinks["wine_servings"].clip(10, 300)
shape: (193,)
| wine_servings |
|---|
| i64 |
| 10 |
| 54 |
| 14 |
| 300 |
| 45 |
| … |
| 10 |
| 10 |
| 10 |
| 10 |
| 10 |
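# either bound can be omitted to clip one side only (a sketch)
drinks["wine_servings"].clip(upper_bound=300)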
String & Time#
drinks = pl.read_csv("data/drinks.csv")
drinks.head()
shape: (5, 6)
| country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent |
|---|---|---|---|---|---|
| str | i64 | i64 | i64 | f64 | str |
| "Afghanistan" | 0 | 0 | 0 | 0.0 | "Asia" |
| "Albania" | 89 | 132 | 54 | 4.9 | "Europe" |
| "Algeria" | 25 | 0 | 14 | 0.7 | "Africa" |
| "Andorra" | 245 | 138 | 312 | 12.4 | "Europe" |
| "Angola" | 217 | 57 | 45 | 5.9 | "Africa" |
Use string methods#
orders = pl.read_csv("data/chipotle.tsv", separator="\t")
orders.head()
shape: (5, 5)
| order_id | quantity | item_name | choice_description | item_price |
|---|---|---|---|---|
| i64 | i64 | str | str | str |
| 1 | 1 | "Chips and Fresh Tomato Salsa" | "NULL" | "$2.39" |
| 1 | 1 | "Izze" | "[Clementine]" | "$3.39" |
| 1 | 1 | "Nantucket Nectar" | "[Apple]" | "$3.39" |
| 1 | 1 | "Chips and Tomatillo-Green Chil… | "NULL" | "$2.39" |
| 2 | 2 | "Chicken Bowl" | "[Tomatillo-Red Chili Salsa (Ho… | "$16.98" |
# string methods for polars Series are accessed via the 'str' namespace
orders["item_name"].str.to_uppercase().head()
shape: (10,)
| item_name |
|---|
| str |
| "CHIPS AND FRESH TOMATO SALSA" |
| "IZZE" |
| "NANTUCKET NECTAR" |
| "CHIPS AND TOMATILLO-GREEN CHIL… |
| "CHICKEN BOWL" |
| "CHICKEN BOWL" |
| "SIDE OF CHIPS" |
| "STEAK BURRITO" |
| "STEAK SOFT TACOS" |
| "STEAK BURRITO" |
# use a boolean expression to filter the DataFrame
orders.filter(pl.col("item_name").str.contains("Chicken")).head()
shape: (5, 5)
| order_id | quantity | item_name | choice_description | item_price |
|---|---|---|---|---|
| i64 | i64 | str | str | str |
| 2 | 2 | "Chicken Bowl" | "[Tomatillo-Red Chili Salsa (Ho… | "$16.98" |
| 3 | 1 | "Chicken Bowl" | "[Fresh Tomato Salsa (Mild), [R… | "$10.98" |
| 6 | 1 | "Chicken Crispy Tacos" | "[Roasted Chili Corn Salsa, [Fa… | "$8.75" |
| 6 | 1 | "Chicken Soft Tacos" | "[Roasted Chili Corn Salsa, [Ri… | "$8.75" |
| 7 | 1 | "Chicken Bowl" | "[Fresh Tomato Salsa, [Fajita V… | "$11.25" |
# many polars string methods support regex; note that 'str.replace' only replaces the first match in each string
orders["choice_description"].str.replace(r"[\[\]]", "").head()
shape: (10,)
| choice_description |
|---|
| str |
| "NULL" |
| "Clementine]" |
| "Apple]" |
| "NULL" |
| "Tomatillo-Red Chili Salsa (Hot… |
| "Fresh Tomato Salsa (Mild), [Ri… |
| "NULL" |
| "Tomatillo Red Chili Salsa, [Fa… |
| "Tomatillo Green Chili Salsa, [… |
| "Fresh Tomato Salsa, [Rice, Bla… |
Dates & Times#
# read a dataset of UFO reports into a DataFrame
ufo = pl.read_csv("data/ufo.csv")
ufo.head()
shape: (5, 5)
| City | Colors Reported | Shape Reported | State | Time |
|---|---|---|---|---|
| str | str | str | str | str |
| "Ithaca" | null | "TRIANGLE" | "NY" | "6/1/1930 22:00" |
| "Willingboro" | null | "OTHER" | "NJ" | "6/30/1930 20:00" |
| "Holyoke" | null | "OVAL" | "CO" | "2/15/1931 14:00" |
| "Abilene" | null | "DISK" | "KS" | "6/1/1931 13:00" |
| "New York Worlds Fair" | null | "LIGHT" | "NY" | "4/18/1933 19:00" |
# 'Time' is currently stored as a string
ufo.dtypes
[String, String, String, String, String]
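# a sketch of parsing 'Time' into a proper Datetime column; the format string
# assumes non-zero-padded month/day values, which the parser accepts
ufo.with_columns(pl.col("Time").str.to_datetime("%m/%d/%Y %H:%M")).dtypes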
Category data#
drinks = pl.read_csv("data/drinks.csv")
drinks.head()
shape: (5, 6)
| country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent |
|---|---|---|---|---|---|
| str | i64 | i64 | i64 | f64 | str |
| "Afghanistan" | 0 | 0 | 0 | 0.0 | "Asia" |
| "Albania" | 89 | 132 | 54 | 4.9 | "Europe" |
| "Algeria" | 25 | 0 | 14 | 0.7 | "Africa" |
| "Andorra" | 245 | 138 | 312 | 12.4 | "Europe" |
| "Angola" | 217 | 57 | 45 | 5.9 | "Africa" |
# use the Categorical data type to store the 'continent' strings as integers
drinks = drinks.with_columns(pl.col("continent").cast(pl.Categorical))
drinks.dtypes
[String, Int64, Int64, Int64, Float64, Categorical]
# strings are now encoded in order of first appearance (0 means 'Asia', 1 means 'Europe', 2 means 'Africa', etc.)
drinks["continent"].to_physical().head()
shape: (10,)
| continent |
|---|
| u32 |
| 0 |
| 1 |
| 2 |
| 1 |
| 2 |
| 3 |
| 4 |
| 1 |
| 5 |
| 1 |
# repeat this process for the 'country' Series
drinks = drinks.with_columns(pl.col("country").cast(pl.Categorical))
# note the 199 categories: the 193 countries plus the 6 continent names already in the string cache
drinks["country"].cat.get_categories()
shape: (199,)
| country |
|---|
| str |
| "Asia" |
| "Europe" |
| "Africa" |
| "North America" |
| "South America" |
| … |
| "Paraguay" |
| "Peru" |
| "Philippines" |
| "Poland" |
| "Portugal" |
Dummy variables#
train = pl.read_csv("data/titanic_train.csv")
train.head()
shape: (5, 12)
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|
| i64 | i64 | i64 | str | str | f64 | i64 | i64 | str | f64 | str | str |
| 1 | 0 | 3 | "Braund, Mr. Owen Harris" | "male" | 22.0 | 1 | 0 | "A/5 21171" | 7.25 | null | "S" |
| 2 | 1 | 1 | "Cumings, Mrs. John Bradley (Fl… | "female" | 38.0 | 1 | 0 | "PC 17599" | 71.2833 | "C85" | "C" |
| 3 | 1 | 3 | "Heikkinen, Miss. Laina" | "female" | 26.0 | 0 | 0 | "STON/O2. 3101282" | 7.925 | null | "S" |
| 4 | 1 | 1 | "Futrelle, Mrs. Jacques Heath (… | "female" | 35.0 | 1 | 0 | "113803" | 53.1 | "C123" | "S" |
| 5 | 0 | 3 | "Allen, Mr. William Henry" | "male" | 35.0 | 0 | 0 | "373450" | 8.05 | null | "S" |
# use 'to_dummies' to create one column for every possible value
train.to_dummies(["Sex"]).head()
shape: (5, 13)
| PassengerId | Survived | Pclass | Name | Sex_female | Sex_male | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| i64 | i64 | i64 | str | u8 | u8 | f64 | i64 | i64 | str | f64 | str | str |
| 1 | 0 | 3 | "Braund, Mr. Owen Harris" | 0 | 1 | 22.0 | 1 | 0 | "A/5 21171" | 7.25 | null | "S" |
| 2 | 1 | 1 | "Cumings, Mrs. John Bradley (Fl… | 1 | 0 | 38.0 | 1 | 0 | "PC 17599" | 71.2833 | "C85" | "C" |
| 3 | 1 | 3 | "Heikkinen, Miss. Laina" | 1 | 0 | 26.0 | 0 | 0 | "STON/O2. 3101282" | 7.925 | null | "S" |
| 4 | 1 | 1 | "Futrelle, Mrs. Jacques Heath (… | 1 | 0 | 35.0 | 1 | 0 | "113803" | 53.1 | "C123" | "S" |
| 5 | 0 | 3 | "Allen, Mr. William Henry" | 0 | 1 | 35.0 | 0 | 0 | "373450" | 8.05 | null | "S" |
# slice off the first column ('PassengerId'); the original column name is used
# as a prefix, and 'separator' controls how it is joined to each value
train.to_dummies(["Sex"], separator=":")[:, 1:].head()
shape: (5, 12)
| Survived | Pclass | Name | Sex:female | Sex:male | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|
| i64 | i64 | str | u8 | u8 | f64 | i64 | i64 | str | f64 | str | str |
| 0 | 3 | "Braund, Mr. Owen Harris" | 0 | 1 | 22.0 | 1 | 0 | "A/5 21171" | 7.25 | null | "S" |
| 1 | 1 | "Cumings, Mrs. John Bradley (Fl… | 1 | 0 | 38.0 | 1 | 0 | "PC 17599" | 71.2833 | "C85" | "C" |
| 1 | 3 | "Heikkinen, Miss. Laina" | 1 | 0 | 26.0 | 0 | 0 | "STON/O2. 3101282" | 7.925 | null | "S" |
| 1 | 1 | "Futrelle, Mrs. Jacques Heath (… | 1 | 0 | 35.0 | 1 | 0 | "113803" | 53.1 | "C123" | "S" |
| 0 | 3 | "Allen, Mr. William Henry" | 0 | 1 | 35.0 | 0 | 0 | "373450" | 8.05 | null | "S" |
# use 'to_dummies' with a feature that has 3 possible values (null gets its own column)
train.to_dummies(["Embarked"], separator=":").head(10)
shape: (10, 15)
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked:C | Embarked:Q | Embarked:S | Embarked:null |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| i64 | i64 | i64 | str | str | f64 | i64 | i64 | str | f64 | str | u8 | u8 | u8 | u8 |
| 1 | 0 | 3 | "Braund, Mr. Owen Harris" | "male" | 22.0 | 1 | 0 | "A/5 21171" | 7.25 | null | 0 | 0 | 1 | 0 |
| 2 | 1 | 1 | "Cumings, Mrs. John Bradley (Fl… | "female" | 38.0 | 1 | 0 | "PC 17599" | 71.2833 | "C85" | 1 | 0 | 0 | 0 |
| 3 | 1 | 3 | "Heikkinen, Miss. Laina" | "female" | 26.0 | 0 | 0 | "STON/O2. 3101282" | 7.925 | null | 0 | 0 | 1 | 0 |
| 4 | 1 | 1 | "Futrelle, Mrs. Jacques Heath (… | "female" | 35.0 | 1 | 0 | "113803" | 53.1 | "C123" | 0 | 0 | 1 | 0 |
| 5 | 0 | 3 | "Allen, Mr. William Henry" | "male" | 35.0 | 0 | 0 | "373450" | 8.05 | null | 0 | 0 | 1 | 0 |
| 6 | 0 | 3 | "Moran, Mr. James" | "male" | null | 0 | 0 | "330877" | 8.4583 | null | 0 | 1 | 0 | 0 |
| 7 | 0 | 1 | "McCarthy, Mr. Timothy J" | "male" | 54.0 | 0 | 0 | "17463" | 51.8625 | "E46" | 0 | 0 | 1 | 0 |
| 8 | 0 | 3 | "Palsson, Master. Gosta Leonard" | "male" | 2.0 | 3 | 1 | "349909" | 21.075 | null | 0 | 0 | 1 | 0 |
| 9 | 1 | 3 | "Johnson, Mrs. Oscar W (Elisabe… | "female" | 27.0 | 0 | 2 | "347742" | 11.1333 | null | 0 | 0 | 1 | 0 |
| 10 | 1 | 2 | "Nasser, Mrs. Nicholas (Adele A… | "female" | 14.0 | 1 | 0 | "237736" | 30.0708 | null | 1 | 0 | 0 | 0 |
# pass multiple columns to 'to_dummies' at once (the original columns are dropped)
train.to_dummies(["Sex", "Embarked"]).head()
shape: (5, 16)
| PassengerId | Survived | Pclass | Name | Sex_female | Sex_male | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked_C | Embarked_Q | Embarked_S | Embarked_null |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| i64 | i64 | i64 | str | u8 | u8 | f64 | i64 | i64 | str | f64 | str | u8 | u8 | u8 | u8 |
| 1 | 0 | 3 | "Braund, Mr. Owen Harris" | 0 | 1 | 22.0 | 1 | 0 | "A/5 21171" | 7.25 | null | 0 | 0 | 1 | 0 |
| 2 | 1 | 1 | "Cumings, Mrs. John Bradley (Fl… | 1 | 0 | 38.0 | 1 | 0 | "PC 17599" | 71.2833 | "C85" | 1 | 0 | 0 | 0 |
| 3 | 1 | 3 | "Heikkinen, Miss. Laina" | 1 | 0 | 26.0 | 0 | 0 | "STON/O2. 3101282" | 7.925 | null | 0 | 0 | 1 | 0 |
| 4 | 1 | 1 | "Futrelle, Mrs. Jacques Heath (… | 1 | 0 | 35.0 | 1 | 0 | "113803" | 53.1 | "C123" | 0 | 0 | 1 | 0 |
| 5 | 0 | 3 | "Allen, Mr. William Henry" | 0 | 1 | 35.0 | 0 | 0 | "373450" | 8.05 | null | 0 | 0 | 1 | 0 |
# use the 'drop_first' parameter to drop one dummy variable from each feature
train.to_dummies(["Sex", "Embarked"], drop_first=True).head()
shape: (5, 14)
| PassengerId | Survived | Pclass | Name | Sex_female | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked_C | Embarked_Q | Embarked_null |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| i64 | i64 | i64 | str | u8 | f64 | i64 | i64 | str | f64 | str | u8 | u8 | u8 |
| 1 | 0 | 3 | "Braund, Mr. Owen Harris" | 0 | 22.0 | 1 | 0 | "A/5 21171" | 7.25 | null | 0 | 0 | 0 |
| 2 | 1 | 1 | "Cumings, Mrs. John Bradley (Fl… | 1 | 38.0 | 1 | 0 | "PC 17599" | 71.2833 | "C85" | 1 | 0 | 0 |
| 3 | 1 | 3 | "Heikkinen, Miss. Laina" | 1 | 26.0 | 0 | 0 | "STON/O2. 3101282" | 7.925 | null | 0 | 0 | 0 |
| 4 | 1 | 1 | "Futrelle, Mrs. Jacques Heath (… | 1 | 35.0 | 1 | 0 | "113803" | 53.1 | "C123" | 0 | 0 | 0 |
| 5 | 0 | 3 | "Allen, Mr. William Henry" | 0 | 35.0 | 0 | 0 | "373450" | 8.05 | null | 0 | 0 | 0 |
Ordinal data#
# create a small DataFrame from a dictionary
small = pl.DataFrame({
"ID": [100, 101, 102, 103],
"quality": ["good", "good", "very good", "excellent"],
})
small.head()
shape: (4, 2)
| ID | quality |
|---|---|
| i64 | str |
| 100 | "good" |
| 101 | "good" |
| 102 | "very good" |
| 103 | "excellent" |
# cast to a lexically ordered Categorical; the result is not assigned back,
# so 'small' still holds plain strings
small.with_columns(pl.col("quality").cast(pl.Categorical("lexical")))
small["quality"]
shape: (4,)
| quality |
|---|
| str |
| "good" |
| "good" |
| "very good" |
| "excellent" |
# sort the DataFrame by the 'quality' Series (lexical order: 'excellent' < 'good' < 'very good')
small.sort(by="quality")
shape: (4, 2)
| ID | quality |
|---|---|
| i64 | str |
| 103 | "excellent" |
| 100 | "good" |
| 101 | "good" |
| 102 | "very good" |
# comparison operators on plain strings also use lexical order
small.filter(pl.col("quality") > "good")
shape: (1, 2)
| ID | quality |
|---|---|
| i64 | str |
| 102 | "very good" |
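# to impose a true logical ordering ('good' < 'very good' < 'excellent'), an
# Enum dtype can be used (a sketch, assuming a polars version with pl.Enum)
quality = pl.Enum(["good", "very good", "excellent"])
small = small.with_columns(pl.col("quality").cast(quality))
small.filter(pl.col("quality") > "good")  # now matches both 'very good' and 'excellent'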
Sample rows#
# use the 'seed' parameter for reproducibility
ufo.sample(n=3, seed=42)
shape: (3, 5)
| City | Colors Reported | Shape Reported | State | Time |
|---|---|---|---|---|
| str | str | str | str | str |
| "Ellis" | null | "OVAL" | "KS" | "10/15/1999 20:30" |
| "Cleveland" | null | "CIRCLE" | "OH" | "5/1/1988 16:00" |
| "Sidney" | null | "OTHER" | "TX" | "11/15/2000 18:00" |
# sample 75% of the DataFrame's rows without replacement
train = ufo.sample(fraction=0.75, seed=99)
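# to also get the held-out 25%, a row index can be added before sampling and the
# complement taken with an anti-join (a sketch; 'row_nr' and 'test' are hypothetical names)
ufo_idx = ufo.with_row_index("row_nr")
train = ufo_idx.sample(fraction=0.75, seed=99)
test = ufo_idx.join(train, on="row_nr", how="anti")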