Cleaning and Preprocessing#

import polars as pl

Explore#

drinks = pl.read_csv("data/drinks.csv")
drinks.head()
shape: (5, 6)
country        beer_servings  spirit_servings  wine_servings  total_litres_of_pure_alcohol  continent
str            i64            i64              i64            f64                           str
"Afghanistan"  0              0                0              0.0                           "Asia"
"Albania"      89             132              54             4.9                           "Europe"
"Algeria"      25             0                14             0.7                           "Africa"
"Andorra"      245            138              312            12.4                          "Europe"
"Angola"       217            57               45             5.9                           "Africa"

Max & Min#

drinks.top_k(10, by="beer_servings")
shape: (10, 6)
country           beer_servings  spirit_servings  wine_servings  total_litres_of_pure_alcohol  continent
str               i64            i64              i64            f64                           str
"Namibia"         376            3                1              6.8                           "Africa"
"Czech Republic"  361            170              134            11.8                          "Europe"
"Gabon"           347            98               59             8.9                           "Africa"
"Germany"         346            117              175            11.3                          "Europe"
"Poland"          343            215              56             10.9                          "Europe"
"Lithuania"       343            244              56             12.9                          "Europe"
"Venezuela"       333            100              3              7.7                           "South America"
"Ireland"         313            118              165            11.4                          "Europe"
"Palau"           306            63               23             6.9                           "Oceania"
"Romania"         297            122              167            10.4                          "Europe"
drinks["beer_servings"].top_k(10)
shape: (10,)
beer_servings
i64
343
361
347
346
343
376
333
313
306
297

Count#

drinks["continent"].value_counts()
shape: (6, 2)
continent        count
str              u32
"North America"  23
"Oceania"        16
"Europe"         45
"Africa"         53
"South America"  12
"Asia"           44
drinks["continent"].value_counts(normalize=True)
shape: (6, 2)
continent        proportion
str              f64
"Europe"         0.233161
"North America"  0.119171
"Oceania"        0.082902
"South America"  0.062176
"Africa"         0.274611
"Asia"           0.227979
drinks["continent"].value_counts().columns
['continent', 'count']
# the result of value_counts is a DataFrame, which can be sorted by its 'count' column
drinks["continent"].value_counts().sort(by="count")
shape: (6, 2)
continent        count
str              u32
"South America"  12
"Oceania"        16
"North America"  23
"Asia"           44
"Europe"         45
"Africa"         53
drinks["continent"].unique()
shape: (6,)
continent
str
"Oceania"
"North America"
"Africa"
"South America"
"Asia"
"Europe"
drinks["continent"].unique_counts()
shape: (6,)
continent
u32
44
45
53
23
12
16

Sort#

movies = pl.read_csv("data/imdb_1000.csv")
movies.head()
shape: (5, 6)
star_rating  title                       content_rating  genre     duration  actors_list
f64          str                         str             str       i64       str
9.3          "The Shawshank Redemption"  "R"             "Crime"   142       "[u'Tim Robbins', u'Morgan Free…
9.2          "The Godfather"             "R"             "Crime"   175       "[u'Marlon Brando', u'Al Pacino…
9.1          "The Godfather: Part II"    "R"             "Crime"   200       "[u'Al Pacino', u'Robert De Nir…
9.0          "The Dark Knight"           "PG-13"         "Action"  152       "[u'Christian Bale', u'Heath Le…
8.9          "Pulp Fiction"              "R"             "Crime"   154       "[u'John Travolta', u'Uma Thurm…
# sort the 'title' Series in ascending order (returns a Series)
movies["title"].sort().head()
shape: (10,)
title
str
"(500) Days of Summer"
"12 Angry Men"
"12 Years a Slave"
"127 Hours"
"2001: A Space Odyssey"
"2046"
"21 Grams"
"25th Hour"
"28 Days Later..."
"3 Idiots"
# sort in descending order instead
movies["title"].sort(descending=True).head()
shape: (10,)
title
str
"[Rec]"
"Zulu"
"Zombieland"
"Zodiac"
"Zero Dark Thirty"
"Zelig"
"Young Frankenstein"
"Yojimbo"
"Yip Man"
"Ying xiong"
# sort the entire DataFrame by the 'title' Series (returns a DataFrame)
movies.sort(by="title").head()
shape: (5, 6)
star_rating  title                    content_rating  genre        duration  actors_list
f64          str                      str             str          i64       str
7.8          "(500) Days of Summer"   "PG-13"         "Comedy"     95        "[u'Zooey Deschanel', u'Joseph …
8.9          "12 Angry Men"           "NOT RATED"     "Drama"      96        "[u'Henry Fonda', u'Lee J. Cobb…
8.1          "12 Years a Slave"       "R"             "Biography"  134       "[u'Chiwetel Ejiofor', u'Michae…
7.6          "127 Hours"              "R"             "Adventure"  94        "[u'James Franco', u'Amber Tamb…
8.3          "2001: A Space Odyssey"  "G"             "Mystery"    160       "[u'Keir Dullea', u'Gary Lockwo…
# the 'descending' parameter controls the sort direction (False is the default, so this matches the previous result)
movies.sort(by="title", descending=False).head()
shape: (5, 6)
star_rating  title                    content_rating  genre        duration  actors_list
f64          str                      str             str          i64       str
7.8          "(500) Days of Summer"   "PG-13"         "Comedy"     95        "[u'Zooey Deschanel', u'Joseph …
8.9          "12 Angry Men"           "NOT RATED"     "Drama"      96        "[u'Henry Fonda', u'Lee J. Cobb…
8.1          "12 Years a Slave"       "R"             "Biography"  134       "[u'Chiwetel Ejiofor', u'Michae…
7.6          "127 Hours"              "R"             "Adventure"  94        "[u'James Franco', u'Amber Tamb…
8.3          "2001: A Space Odyssey"  "G"             "Mystery"    160       "[u'Keir Dullea', u'Gary Lockwo…
# sort the DataFrame first by 'content_rating', then by 'duration'
movies.sort(by=["content_rating", "duration"]).head()
shape: (5, 6)
star_rating  title                             content_rating  genre        duration  actors_list
f64          str                               str             str          i64       str
8.2          "Butch Cassidy and the Sundance…  null            "Biography"  110       "[u'Paul Newman', u'Robert Redf…
7.4          "True Grit"                       null            "Adventure"  128       "[u'John Wayne', u'Kim Darby', …
7.7          "Where Eagles Dare"               null            "Action"     158       "[u'Richard Burton', u'Clint Ea…
7.6          "The Jungle Book"                 "APPROVED"      "Animation"  78        "[u'Phil Harris', u'Sebastian C…
7.8          "Invasion of the Body Snatchers"  "APPROVED"      "Horror"     80        "[u'Kevin McCarthy', u'Dana Wyn…

Transform#

Group#

drinks = pl.read_csv("data/drinks.csv")
drinks.head()
shape: (5, 6)
country        beer_servings  spirit_servings  wine_servings  total_litres_of_pure_alcohol  continent
str            i64            i64              i64            f64                           str
"Afghanistan"  0              0                0              0.0                           "Asia"
"Albania"      89             132              54             4.9                           "Europe"
"Algeria"      25             0                14             0.7                           "Africa"
"Andorra"      245            138              312            12.4                          "Europe"
"Angola"       217            57               45             5.9                           "Africa"
# calculate the mean beer servings for each continent
drinks.group_by("continent").mean()["beer_servings"]
shape: (6,)
beer_servings
f64
89.6875
193.777778
145.434783
175.083333
37.045455
61.471698
# other aggregation functions (such as 'max') can also be used with group_by
drinks.group_by("continent").max()["beer_servings"]
shape: (6,)
beer_servings
i64
306
285
247
361
376
333
# multiple aggregation functions can be applied simultaneously
drinks.group_by("continent").agg(
    pl.col("beer_servings").count().name.suffix("_count"),
    pl.col("beer_servings").mean().name.suffix("_mean"),
    pl.col("beer_servings").min().name.suffix("_min"),
    pl.col("beer_servings").max().name.suffix("_max"),
)
shape: (6, 5)
continent        beer_servings_count  beer_servings_mean  beer_servings_min  beer_servings_max
str              u32                  f64                 i64                i64
"South America"  12                   175.083333          93                 333
"Africa"         53                   61.471698           0                  376
"Europe"         45                   193.777778          0                  361
"North America"  23                   145.434783          1                  285
"Oceania"        16                   89.6875             0                  306
"Asia"           44                   37.045455           0                  247

Apply#

drinks.group_by("continent").mean()
shape: (6, 6)
continent        country  beer_servings  spirit_servings  wine_servings  total_litres_of_pure_alcohol
str              str      f64            f64              f64            f64
"North America"  null     145.434783     165.73913        24.521739      5.995652
"Africa"         null     61.471698      16.339623        16.264151      3.007547
"Asia"           null     37.045455      60.840909        9.068182       2.170455
"South America"  null     175.083333     114.75           62.416667      6.308333
"Oceania"        null     89.6875        58.4375          35.625         3.38125
"Europe"         null     193.777778     132.555556       142.222222     8.617778
iris = pl.read_csv("data/iris.csv")
iris.head()
shape: (5, 5)
sepal_length  sepal_width  petal_length  petal_width  species
f64           f64          f64           f64          str
5.1           3.5          1.4           0.2          "setosa"
4.9           3.0          1.4           0.2          "setosa"
4.7           3.2          1.3           0.2          "setosa"
4.6           3.1          1.5           0.2          "setosa"
5.0           3.6          1.4           0.2          "setosa"
iris.group_by("species").len()
shape: (3, 2)
species       len
str           u32
"virginica"   50
"setosa"      50
"versicolor"  50
iris.group_by("species").agg([
    pl.min("sepal_length").name.suffix("_min"),
    pl.max("sepal_length").name.suffix("_max"),
    pl.mean("petal_length").name.suffix("_mean"),
    pl.std("petal_length").name.suffix("_std"),
])
shape: (3, 5)
species       sepal_length_min  sepal_length_max  petal_length_mean  petal_length_std
str           f64               f64               f64                f64
"virginica"   4.9               7.9               5.552              0.551895
"setosa"      4.3               5.8               1.462              0.173664
"versicolor"  4.9               7.0               4.26               0.469911

Apply a function#

train = pl.read_csv("data/titanic_train.csv")
train.head()
shape: (5, 12)
PassengerId  Survived  Pclass  Name  Sex  Age  SibSp  Parch  Ticket  Fare  Cabin  Embarked
i64  i64  i64  str  str  f64  i64  i64  str  f64  str  str
1  0  3  "Braund, Mr. Owen Harris"  "male"  22.0  1  0  "A/5 21171"  7.25  null  "S"
2  1  1  "Cumings, Mrs. John Bradley (Fl…  "female"  38.0  1  0  "PC 17599"  71.2833  "C85"  "C"
3  1  3  "Heikkinen, Miss. Laina"  "female"  26.0  0  0  "STON/O2. 3101282"  7.925  null  "S"
4  1  1  "Futrelle, Mrs. Jacques Heath (…  "female"  35.0  1  0  "113803"  53.1  "C123"  "S"
5  0  3  "Allen, Mr. William Henry"  "male"  35.0  0  0  "373450"  8.05  null  "S"
# map 'female' to "0" and 'male' to "1" (note: 'replace' preserves the column's string dtype)
train = train.with_columns(Sex_num=pl.col("Sex").replace({"female": 0, "male": 1}))
train[0:4, ["Sex", "Sex_num"]]
shape: (4, 2)
Sex       Sex_num
str       str
"male"    "1"
"female"  "0"
"female"  "0"
"female"  "0"
# calculate the length of each string in the 'Name' Series
train = train.with_columns(Name_length=pl.col("Name").str.len_chars())
train[0:4, ["Name", "Name_length"]]
shape: (4, 2)
Name                              Name_length
str                               u32
"Braund, Mr. Owen Harris"         23
"Cumings, Mrs. John Bradley (Fl…  51
"Heikkinen, Miss. Laina"          22
"Futrelle, Mrs. Jacques Heath (…  44
# use a string method to split the 'Name' Series at commas (returns a Series of lists)
train["Name"].str.split(",").head()
shape: (10,)
Name
list[str]
["Braund", " Mr. Owen Harris"]
["Cumings", " Mrs. John Bradley (Florence Briggs Thayer)"]
["Heikkinen", " Miss. Laina"]
["Futrelle", " Mrs. Jacques Heath (Lily May Peel)"]
["Allen", " Mr. William Henry"]
["Moran", " Mr. James"]
["McCarthy", " Mr. Timothy J"]
["Palsson", " Master. Gosta Leonard"]
["Johnson", " Mrs. Oscar W (Elisabeth Vilhelmina Berg)"]
["Nasser", " Mrs. Nicholas (Adele Achem)"]
# use a lambda function to extract the first element of each list
train["Name"].str.split(",").map_elements(lambda x: x[0], return_dtype=pl.String).head()
shape: (10,)
Name
str
"Braund"
"Cumings"
"Heikkinen"
"Futrelle"
"Allen"
"Moran"
"McCarthy"
"Palsson"
"Johnson"
"Nasser"

Pivot#

stocks = pl.read_csv("data/stocks.csv")
stocks
shape: (9, 4)
Date          Close   Volume    Symbol
str           f64     i64       str
"2016-10-03"  31.5    14070500  "CSCO"
"2016-10-03"  112.52  21701800  "AAPL"
"2016-10-03"  57.42   19189500  "MSFT"
"2016-10-04"  113.0   29736800  "AAPL"
"2016-10-04"  57.24   20085900  "MSFT"
"2016-10-04"  31.35   18460400  "CSCO"
"2016-10-05"  57.64   16726400  "MSFT"
"2016-10-05"  31.59   11808600  "CSCO"
"2016-10-05"  113.05  21453100  "AAPL"
stocks = stocks.pivot(values="Close", index="Symbol", on="Date")
stocks
shape: (3, 4)
Symbol  2016-10-03  2016-10-04  2016-10-05
str     f64         f64         f64
"CSCO"  31.5        31.35       31.59
"AAPL"  112.52      113.0       113.05
"MSFT"  57.42       57.24       57.64

Clean#

Remove duplicate rows#

user_cols = ["user_id", "age", "gender", "occupation", "zip_code"]
users = pl.read_csv("data/movie.user", separator="|", has_header=False)
users.columns = user_cols
users.head()
shape: (5, 5)
user_id  age  gender  occupation    zip_code
i64      i64  str     str           str
1        24   "M"     "technician"  "85711"
2        53   "F"     "other"       "94043"
3        23   "M"     "writer"      "32067"
4        24   "M"     "technician"  "43537"
5        33   "F"     "other"       "15213"
users.shape
(943, 5)
# count the duplicate items (True becomes 1, False becomes 0)
users["zip_code"].is_duplicated().sum()
250
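Counting duplicates is not the same as removing them; duplicate rows can be dropped with 'unique'. A sketch, using a hypothetical subset of columns:

# drop rows that share the same 'age' and 'zip_code', keeping the first occurrence
users.unique(subset=["age", "zip_code"], keep="first").shape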

Detect missing values#

ufo = pl.read_csv("data/ufo.csv")
ufo.tail()
shape: (5, 5)
City           Colors Reported  Shape Reported  State  Time
str            str              str             str    str
"Grant Park"   null             "TRIANGLE"      "IL"   "12/31/2000 23:00"
"Spirit Lake"  null             "DISK"          "IA"   "12/31/2000 23:00"
"Eagle River"  null             null            "WI"   "12/31/2000 23:45"
"Eagle River"  "RED"            "LIGHT"         "WI"   "12/31/2000 23:45"
"Ybor"         null             "OVAL"          "FL"   "12/31/2000 23:59"
ufo.select(pl.all().is_null())
shape: (18_241, 5)
City   Colors Reported  Shape Reported  State  Time
bool   bool             bool            bool   bool
false  true             false           false  false
false  true             false           false  false
false  true             false           false  false
false  true             false           false  false
false  true             false           false  false
…
false  true             false           false  false
false  true             false           false  false
false  true             true            false  false
false  false            false           false  false
false  true             false           false  false
ufo.select(pl.all().is_not_null())
shape: (18_241, 5)
City  Colors Reported  Shape Reported  State  Time
bool  bool             bool            bool   bool
true  false            true            true   true
true  false            true            true   true
true  false            true            true   true
true  false            true            true   true
true  false            true            true   true
…
true  false            true            true   true
true  false            true            true   true
true  false            false           true   true
true  true             true            true   true
true  false            true            true   true
# count the number of missing values in each Series
ufo.null_count()
shape: (1, 5)
City  Colors Reported  Shape Reported  State  Time
u32   u32              u32             u32    u32
25    15359            2644            0      0
# if 'any' values are missing in a row, then drop that row
ufo.drop_nulls().shape
(2486, 5)
# if 'all' values are missing in a row, then drop that row (none are dropped in this case)
ufo.filter(~pl.all_horizontal(pl.all().is_null())).shape
(18241, 5)
# if 'all' values are missing in a row (considering only 'City' and 'Shape Reported'), then drop that row
ufo.filter(~pl.all_horizontal(pl.col(["City", "Shape Reported"]).is_null())).shape
(18237, 5)
# unlike pandas, 'value_counts' includes missing values by default
ufo["Shape Reported"].value_counts()
shape: (28, 2)
Shape Reported  count
str             u32
"CONE"          60
"FLASH"         188
"SPHERE"        1054
"CROSS"         36
"EGG"           197
…
"VARIOUS"       333
"FLARE"         1
null            2644
"TEARDROP"      119
"CRESCENT"      2

Fill missing values#

# fill in missing values with a specified value
ufo.with_columns([
    pl.col("Shape Reported").fill_null(value="VARIOUS"),
    pl.col("Colors Reported").fill_null(value="UNKNOWN"),
])
shape: (18_241, 5)
City                    Colors Reported  Shape Reported  State  Time
str                     str              str             str    str
"Ithaca"                "UNKNOWN"        "TRIANGLE"      "NY"   "6/1/1930 22:00"
"Willingboro"           "UNKNOWN"        "OTHER"         "NJ"   "6/30/1930 20:00"
"Holyoke"               "UNKNOWN"        "OVAL"          "CO"   "2/15/1931 14:00"
"Abilene"               "UNKNOWN"        "DISK"          "KS"   "6/1/1931 13:00"
"New York Worlds Fair"  "UNKNOWN"        "LIGHT"         "NY"   "4/18/1933 19:00"
…
"Grant Park"            "UNKNOWN"        "TRIANGLE"      "IL"   "12/31/2000 23:00"
"Spirit Lake"           "UNKNOWN"        "DISK"          "IA"   "12/31/2000 23:00"
"Eagle River"           "UNKNOWN"        "VARIOUS"       "WI"   "12/31/2000 23:45"
"Eagle River"           "RED"            "LIGHT"         "WI"   "12/31/2000 23:45"
"Ybor"                  "UNKNOWN"        "OVAL"          "FL"   "12/31/2000 23:59"
# note that 'ufo' itself is unchanged: 'with_columns' returned a new DataFrame, so these counts match the original
ufo["Shape Reported"].value_counts().head()
shape: (5, 2)
Shape Reported  count
str             u32
"CONE"          60
"CIGAR"         617
"OTHER"         1402
"CRESCENT"      2
"LIGHT"         2803
# fill missing values using the "forward fill" strategy (returns a new DataFrame; 'ufo' is unchanged)
ufo.fill_null(strategy="forward").tail()
shape: (5, 5)
City           Colors Reported  Shape Reported  State  Time
str            str              str             str    str
"Grant Park"   "RED"            "TRIANGLE"      "IL"   "12/31/2000 23:00"
"Spirit Lake"  "RED"            "DISK"          "IA"   "12/31/2000 23:00"
"Eagle River"  "RED"            "DISK"          "WI"   "12/31/2000 23:45"
"Eagle River"  "RED"            "LIGHT"         "WI"   "12/31/2000 23:45"
"Ybor"         "RED"            "OVAL"          "FL"   "12/31/2000 23:59"
# compare with the "backward fill" strategy (again, 'ufo' is unchanged)
ufo.fill_null(strategy="backward").tail()
shape: (5, 5)
City           Colors Reported  Shape Reported  State  Time
str            str              str             str    str
"Grant Park"   "RED"            "TRIANGLE"      "IL"   "12/31/2000 23:00"
"Spirit Lake"  "RED"            "DISK"          "IA"   "12/31/2000 23:00"
"Eagle River"  "RED"            "LIGHT"         "WI"   "12/31/2000 23:45"
"Eagle River"  "RED"            "LIGHT"         "WI"   "12/31/2000 23:45"
"Ybor"         null             "OVAL"          "FL"   "12/31/2000 23:59"

Outliers (clip)#

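# cap values outside the range [10, 300]: anything below 10 becomes 10, anything above 300 becomes 300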
drinks["wine_servings"].clip(10, 300)
shape: (193,)
wine_servings
i64
10
54
14
300
45
…
10
10
10
10
10

String & Time#

drinks = pl.read_csv("data/drinks.csv")
drinks.head()
shape: (5, 6)
country        beer_servings  spirit_servings  wine_servings  total_litres_of_pure_alcohol  continent
str            i64            i64              i64            f64                           str
"Afghanistan"  0              0                0              0.0                           "Asia"
"Albania"      89             132              54             4.9                           "Europe"
"Algeria"      25             0                14             0.7                           "Africa"
"Andorra"      245            138              312            12.4                          "Europe"
"Angola"       217            57               45             5.9                           "Africa"

Use string methods#

orders = pl.read_csv("data/chipotle.tsv", separator="\t")
orders.head()
shape: (5, 5)
order_id  quantity  item_name                         choice_description                item_price
i64       i64       str                               str                               str
1         1         "Chips and Fresh Tomato Salsa"    "NULL"                            "$2.39"
1         1         "Izze"                            "[Clementine]"                    "$3.39"
1         1         "Nantucket Nectar"                "[Apple]"                         "$3.39"
1         1         "Chips and Tomatillo-Green Chil…  "NULL"                            "$2.39"
2         2         "Chicken Bowl"                    "[Tomatillo-Red Chili Salsa (Ho…  "$16.98"
# string methods for polars Series are accessed via the 'str' namespace
orders["item_name"].str.to_uppercase().head()
shape: (10,)
item_name
str
"CHIPS AND FRESH TOMATO SALSA"
"IZZE"
"NANTUCKET NECTAR"
"CHIPS AND TOMATILLO-GREEN CHIL…
"CHICKEN BOWL"
"CHICKEN BOWL"
"SIDE OF CHIPS"
"STEAK BURRITO"
"STEAK SOFT TACOS"
"STEAK BURRITO"
# use a boolean expression to filter the DataFrame
orders.filter(pl.col("item_name").str.contains("Chicken")).head()
shape: (5, 5)
order_id  quantity  item_name               choice_description                item_price
i64       i64       str                     str                               str
2         2         "Chicken Bowl"          "[Tomatillo-Red Chili Salsa (Ho…  "$16.98"
3         1         "Chicken Bowl"          "[Fresh Tomato Salsa (Mild), [R…  "$10.98"
6         1         "Chicken Crispy Tacos"  "[Roasted Chili Corn Salsa, [Fa…  "$8.75"
6         1         "Chicken Soft Tacos"    "[Roasted Chili Corn Salsa, [Ri…  "$8.75"
7         1         "Chicken Bowl"          "[Fresh Tomato Salsa, [Fajita V…  "$11.25"
# polars string methods accept regular expressions (regex); note that 'replace' only replaces the first match (use 'replace_all' to replace every match)
orders["choice_description"].str.replace(r"[\[\]]", "").head()
shape: (10,)
choice_description
str
"NULL"
"Clementine]"
"Apple]"
"NULL"
"Tomatillo-Red Chili Salsa (Hot…
"Fresh Tomato Salsa (Mild), [Ri…
"NULL"
"Tomatillo Red Chili Salsa, [Fa…
"Tomatillo Green Chili Salsa, […
"Fresh Tomato Salsa, [Rice, Bla…

Dates & Times#

# read a dataset of UFO reports into a DataFrame
ufo = pl.read_csv("data/ufo.csv")
ufo.head()
shape: (5, 5)
City                    Colors Reported  Shape Reported  State  Time
str                     str              str             str    str
"Ithaca"                null             "TRIANGLE"      "NY"   "6/1/1930 22:00"
"Willingboro"           null             "OTHER"         "NJ"   "6/30/1930 20:00"
"Holyoke"               null             "OVAL"          "CO"   "2/15/1931 14:00"
"Abilene"               null             "DISK"          "KS"   "6/1/1931 13:00"
"New York Worlds Fair"  null             "LIGHT"         "NY"   "4/18/1933 19:00"
# 'Time' is currently stored as a string
ufo.dtypes
[String, String, String, String, String]
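The string column can be parsed into a proper Datetime with 'str.to_datetime', after which the 'dt' namespace exposes the usual components. A minimal sketch (the format string is an assumption based on the sample values above):

ufo = ufo.with_columns(pl.col("Time").str.to_datetime("%m/%d/%Y %H:%M"))
# extract components from the parsed datetimes
ufo.select(
    pl.col("Time").dt.year().alias("year"),
    pl.col("Time").dt.hour().alias("hour"),
).head()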

Category data#

drinks = pl.read_csv("data/drinks.csv")
drinks.head()
shape: (5, 6)
country        beer_servings  spirit_servings  wine_servings  total_litres_of_pure_alcohol  continent
str            i64            i64              i64            f64                           str
"Afghanistan"  0              0                0              0.0                           "Asia"
"Albania"      89             132              54             4.9                           "Europe"
"Algeria"      25             0                14             0.7                           "Africa"
"Andorra"      245            138              312            12.4                          "Europe"
"Angola"       217            57               45             5.9                           "Africa"
# use the Categorical dtype to store the 'continent' strings as integer codes
drinks = drinks.with_columns(pl.col("continent").cast(pl.Categorical))
drinks.dtypes
[String, Int64, Int64, Int64, Float64, Categorical(ordering='physical')]
# strings are now encoded; codes are assigned in order of appearance (0 means 'Asia', 1 means 'Europe', 2 means 'Africa', etc.)
drinks["continent"].to_physical().head()
shape: (10,)
continent
u32
0
1
2
1
2
3
4
1
5
1
# repeat this process for the 'country' Series
drinks = drinks.with_columns(pl.col("country").cast(pl.Categorical))
# list the 193 categories that were created for the 'country' Series
drinks["country"].cat.get_categories()
shape: (193,)
country
str
"Afghanistan"
"Albania"
"Algeria"
"Andorra"
"Angola"
…
"Venezuela"
"Vietnam"
"Yemen"
"Zambia"
"Zimbabwe"

Dummy variables#

train = pl.read_csv("data/titanic_train.csv")
train.head()
shape: (5, 12)
PassengerId  Survived  Pclass  Name  Sex  Age  SibSp  Parch  Ticket  Fare  Cabin  Embarked
i64  i64  i64  str  str  f64  i64  i64  str  f64  str  str
1  0  3  "Braund, Mr. Owen Harris"  "male"  22.0  1  0  "A/5 21171"  7.25  null  "S"
2  1  1  "Cumings, Mrs. John Bradley (Fl…  "female"  38.0  1  0  "PC 17599"  71.2833  "C85"  "C"
3  1  3  "Heikkinen, Miss. Laina"  "female"  26.0  0  0  "STON/O2. 3101282"  7.925  null  "S"
4  1  1  "Futrelle, Mrs. Jacques Heath (…  "female"  35.0  1  0  "113803"  53.1  "C123"  "S"
5  0  3  "Allen, Mr. William Henry"  "male"  35.0  0  0  "373450"  8.05  null  "S"
# use 'to_dummies' to create one column for every possible value
train.to_dummies(["Sex"]).head()
shape: (5, 13)
PassengerId  Survived  Pclass  Name  Sex_female  Sex_male  Age  SibSp  Parch  Ticket  Fare  Cabin  Embarked
i64  i64  i64  str  u8  u8  f64  i64  i64  str  f64  str  str
1  0  3  "Braund, Mr. Owen Harris"  0  1  22.0  1  0  "A/5 21171"  7.25  null  "S"
2  1  1  "Cumings, Mrs. John Bradley (Fl…  1  0  38.0  1  0  "PC 17599"  71.2833  "C85"  "C"
3  1  3  "Heikkinen, Miss. Laina"  1  0  26.0  0  0  "STON/O2. 3101282"  7.925  null  "S"
4  1  1  "Futrelle, Mrs. Jacques Heath (…  1  0  35.0  1  0  "113803"  53.1  "C123"  "S"
5  0  3  "Allen, Mr. William Henry"  0  1  35.0  0  0  "373450"  8.05  null  "S"
# positional slicing with [:, 1:] drops the first column (here 'PassengerId')
# use the 'separator' parameter to control how the dummy-column names are built
train.to_dummies(["Sex"], separator=":")[:, 1:].head()
shape: (5, 12)
Survived  Pclass  Name  Sex:female  Sex:male  Age  SibSp  Parch  Ticket  Fare  Cabin  Embarked
i64  i64  str  u8  u8  f64  i64  i64  str  f64  str  str
0  3  "Braund, Mr. Owen Harris"  0  1  22.0  1  0  "A/5 21171"  7.25  null  "S"
1  1  "Cumings, Mrs. John Bradley (Fl…  1  0  38.0  1  0  "PC 17599"  71.2833  "C85"  "C"
1  3  "Heikkinen, Miss. Laina"  1  0  26.0  0  0  "STON/O2. 3101282"  7.925  null  "S"
1  1  "Futrelle, Mrs. Jacques Heath (…  1  0  35.0  1  0  "113803"  53.1  "C123"  "S"
0  3  "Allen, Mr. William Henry"  0  1  35.0  0  0  "373450"  8.05  null  "S"
# use 'to_dummies' with a feature that has 3 possible values (plus null, which gets its own column)
train.to_dummies(["Embarked"], separator=":").head(10)
shape: (10, 15)
PassengerId  Survived  Pclass  Name  Sex  Age  SibSp  Parch  Ticket  Fare  Cabin  Embarked:C  Embarked:Q  Embarked:S  Embarked:null
i64  i64  i64  str  str  f64  i64  i64  str  f64  str  u8  u8  u8  u8
1  0  3  "Braund, Mr. Owen Harris"  "male"  22.0  1  0  "A/5 21171"  7.25  null  0  0  1  0
2  1  1  "Cumings, Mrs. John Bradley (Fl…  "female"  38.0  1  0  "PC 17599"  71.2833  "C85"  1  0  0  0
3  1  3  "Heikkinen, Miss. Laina"  "female"  26.0  0  0  "STON/O2. 3101282"  7.925  null  0  0  1  0
4  1  1  "Futrelle, Mrs. Jacques Heath (…  "female"  35.0  1  0  "113803"  53.1  "C123"  0  0  1  0
5  0  3  "Allen, Mr. William Henry"  "male"  35.0  0  0  "373450"  8.05  null  0  0  1  0
6  0  3  "Moran, Mr. James"  "male"  null  0  0  "330877"  8.4583  null  0  1  0  0
7  0  1  "McCarthy, Mr. Timothy J"  "male"  54.0  0  0  "17463"  51.8625  "E46"  0  0  1  0
8  0  3  "Palsson, Master. Gosta Leonard"  "male"  2.0  3  1  "349909"  21.075  null  0  0  1  0
9  1  3  "Johnson, Mrs. Oscar W (Elisabe…  "female"  27.0  0  2  "347742"  11.1333  null  0  0  1  0
10  1  2  "Nasser, Mrs. Nicholas (Adele A…  "female"  14.0  1  0  "237736"  30.0708  null  1  0  0  0
# pass several columns to 'to_dummies' at once (it drops the original columns)
train.to_dummies(["Sex", "Embarked"]).head()
shape: (5, 16)
PassengerId  Survived  Pclass  Name  Sex_female  Sex_male  Age  SibSp  Parch  Ticket  Fare  Cabin  Embarked_C  Embarked_Q  Embarked_S  Embarked_null
i64  i64  i64  str  u8  u8  f64  i64  i64  str  f64  str  u8  u8  u8  u8
1  0  3  "Braund, Mr. Owen Harris"  0  1  22.0  1  0  "A/5 21171"  7.25  null  0  0  1  0
2  1  1  "Cumings, Mrs. John Bradley (Fl…  1  0  38.0  1  0  "PC 17599"  71.2833  "C85"  1  0  0  0
3  1  3  "Heikkinen, Miss. Laina"  1  0  26.0  0  0  "STON/O2. 3101282"  7.925  null  0  0  1  0
4  1  1  "Futrelle, Mrs. Jacques Heath (…  1  0  35.0  1  0  "113803"  53.1  "C123"  0  0  1  0
5  0  3  "Allen, Mr. William Henry"  0  1  35.0  0  0  "373450"  8.05  null  0  0  1  0
# use the 'drop_first' parameter to drop the first dummy variable for each feature
train.to_dummies(["Sex", "Embarked"], drop_first=True).head()
shape: (5, 14)
PassengerId  Survived  Pclass  Name  Sex_female  Age  SibSp  Parch  Ticket  Fare  Cabin  Embarked_C  Embarked_Q  Embarked_null
i64  i64  i64  str  u8  f64  i64  i64  str  f64  str  u8  u8  u8
1  0  3  "Braund, Mr. Owen Harris"  0  22.0  1  0  "A/5 21171"  7.25  null  0  0  0
2  1  1  "Cumings, Mrs. John Bradley (Fl…  1  38.0  1  0  "PC 17599"  71.2833  "C85"  1  0  0
3  1  3  "Heikkinen, Miss. Laina"  1  26.0  0  0  "STON/O2. 3101282"  7.925  null  0  0  0
4  1  1  "Futrelle, Mrs. Jacques Heath (…  1  35.0  1  0  "113803"  53.1  "C123"  0  0  0
5  0  3  "Allen, Mr. William Henry"  0  35.0  0  0  "373450"  8.05  null  0  0  0

Ordinal data#

# create a small DataFrame from a dictionary
small = pl.DataFrame({
    "ID": [100, 101, 102, 103],
    "quality": ["good", "good", "very good", "excellent"],
})
small.head()
shape: (4, 2)
ID   quality
i64  str
100  "good"
101  "good"
102  "very good"
103  "excellent"
# cast 'quality' to a Categorical with lexical (alphabetical) ordering
small = small.with_columns(pl.col("quality").cast(pl.Categorical("lexical")))
small["quality"]
shape: (4,)
quality
cat
"good"
"good"
"very good"
"excellent"
# sort the DataFrame by the 'quality' Series (lexical order)
small.sort(by="quality")
shape: (4, 2)
IDquality
i64str
103"excellent"
100"good"
101"good"
102"very good"
# comparison operators work with lexically ordered categories
small.filter(pl.col("quality") > "good")
shape: (1, 2)
IDquality
i64str
102"very good"

Sample rows#

# use the 'seed' parameter for reproducibility
ufo.sample(n=3, seed=42)
shape: (3, 5)
City              Colors Reported  Shape Reported  State  Time
str               str              str             str    str
"Kennett Square"  null             "OTHER"         "PA"   "6/1/1975 22:00"
"Landaff"         null             "TRIANGLE"      "NH"   "11/1/1986 22:50"
"Mercedes"        null             "OVAL"          "TX"   "2/14/1986 8:15"
# sample 75% of the DataFrame's rows without replacement
train = ufo.sample(fraction=0.75, seed=99)
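polars has no row index, so the pandas trick of selecting the held-out rows via the index needs a different route. A sketch using a temporary row index and an anti-join (the column name 'row_nr' is arbitrary):

# tag each row, sample 75%, then anti-join to recover the remaining 25%
ufo_idx = ufo.with_row_index("row_nr")
train = ufo_idx.sample(fraction=0.75, seed=99)
test = ufo_idx.join(train, on="row_nr", how="anti")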