Cleaning and Preprocessing#
import polars as pl
Explore#
drinks = pl.read_csv("data/drinks.csv")
drinks.head()
shape: (5, 6)
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent |
---|---|---|---|---|---|
str | i64 | i64 | i64 | f64 | str |
"Afghanistan" | 0 | 0 | 0 | 0.0 | "Asia" |
"Albania" | 89 | 132 | 54 | 4.9 | "Europe" |
"Algeria" | 25 | 0 | 14 | 0.7 | "Africa" |
"Andorra" | 245 | 138 | 312 | 12.4 | "Europe" |
"Angola" | 217 | 57 | 45 | 5.9 | "Africa" |
Max & Min#
drinks.top_k(10, by="beer_servings")
shape: (10, 6)
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent |
---|---|---|---|---|---|
str | i64 | i64 | i64 | f64 | str |
"Namibia" | 376 | 3 | 1 | 6.8 | "Africa" |
"Czech Republic" | 361 | 170 | 134 | 11.8 | "Europe" |
"Gabon" | 347 | 98 | 59 | 8.9 | "Africa" |
"Germany" | 346 | 117 | 175 | 11.3 | "Europe" |
"Poland" | 343 | 215 | 56 | 10.9 | "Europe" |
"Lithuania" | 343 | 244 | 56 | 12.9 | "Europe" |
"Venezuela" | 333 | 100 | 3 | 7.7 | "South America" |
"Ireland" | 313 | 118 | 165 | 11.4 | "Europe" |
"Palau" | 306 | 63 | 23 | 6.9 | "Oceania" |
"Romania" | 297 | 122 | 167 | 10.4 | "Europe" |
drinks["beer_servings"].top_k(10)
shape: (10,)
beer_servings |
---|
i64 |
343 |
361 |
347 |
346 |
343 |
376 |
333 |
313 |
306 |
297 |
Count#
drinks["continent"].value_counts()
shape: (6, 2)
continent | count |
---|---|
str | u32 |
"North America" | 23 |
"Oceania" | 16 |
"Europe" | 45 |
"Africa" | 53 |
"South America" | 12 |
"Asia" | 44 |
drinks["continent"].value_counts(normalize=True)
shape: (6, 2)
continent | proportion |
---|---|
str | f64 |
"Europe" | 0.233161 |
"North America" | 0.119171 |
"Oceania" | 0.082902 |
"South America" | 0.062176 |
"Africa" | 0.274611 |
"Asia" | 0.227979 |
drinks["continent"].value_counts().columns
['continent', 'count']
# the DataFrame returned by 'value_counts' can be sorted by any of its columns
drinks["continent"].value_counts().sort(by="count")
shape: (6, 2)
continent | count |
---|---|
str | u32 |
"South America" | 12 |
"Oceania" | 16 |
"North America" | 23 |
"Asia" | 44 |
"Europe" | 45 |
"Africa" | 53 |
drinks["continent"].unique()
shape: (6,)
continent |
---|
str |
"Oceania" |
"North America" |
"Africa" |
"South America" |
"Asia" |
"Europe" |
drinks["continent"].unique_counts()
shape: (6,)
continent |
---|
u32 |
44 |
45 |
53 |
23 |
12 |
16 |
Sort#
movies = pl.read_csv("data/imdb_1000.csv")
movies.head()
shape: (5, 6)
star_rating | title | content_rating | genre | duration | actors_list |
---|---|---|---|---|---|
f64 | str | str | str | i64 | str |
9.3 | "The Shawshank Redemption" | "R" | "Crime" | 142 | "[u'Tim Robbins', u'Morgan Free… |
9.2 | "The Godfather" | "R" | "Crime" | 175 | "[u'Marlon Brando', u'Al Pacino… |
9.1 | "The Godfather: Part II" | "R" | "Crime" | 200 | "[u'Al Pacino', u'Robert De Nir… |
9.0 | "The Dark Knight" | "PG-13" | "Action" | 152 | "[u'Christian Bale', u'Heath Le… |
8.9 | "Pulp Fiction" | "R" | "Crime" | 154 | "[u'John Travolta', u'Uma Thurm… |
# sort the 'title' Series in ascending order (returns a Series)
movies["title"].sort().head()
shape: (10,)
title |
---|
str |
"(500) Days of Summer" |
"12 Angry Men" |
"12 Years a Slave" |
"127 Hours" |
"2001: A Space Odyssey" |
"2046" |
"21 Grams" |
"25th Hour" |
"28 Days Later..." |
"3 Idiots" |
# sort in descending order instead
movies["title"].sort(descending=True).head()
shape: (10,)
title |
---|
str |
"[Rec]" |
"Zulu" |
"Zombieland" |
"Zodiac" |
"Zero Dark Thirty" |
"Zelig" |
"Young Frankenstein" |
"Yojimbo" |
"Yip Man" |
"Ying xiong" |
# sort the entire DataFrame by the 'title' Series (returns a DataFrame)
movies.sort(by="title").head()
shape: (5, 6)
star_rating | title | content_rating | genre | duration | actors_list |
---|---|---|---|---|---|
f64 | str | str | str | i64 | str |
7.8 | "(500) Days of Summer" | "PG-13" | "Comedy" | 95 | "[u'Zooey Deschanel', u'Joseph … |
8.9 | "12 Angry Men" | "NOT RATED" | "Drama" | 96 | "[u'Henry Fonda', u'Lee J. Cobb… |
8.1 | "12 Years a Slave" | "R" | "Biography" | 134 | "[u'Chiwetel Ejiofor', u'Michae… |
7.6 | "127 Hours" | "R" | "Adventure" | 94 | "[u'James Franco', u'Amber Tamb… |
8.3 | "2001: A Space Odyssey" | "G" | "Mystery" | 160 | "[u'Keir Dullea', u'Gary Lockwo… |
# pass descending=False to make the default ascending order explicit
movies.sort(by="title", descending=False).head()
shape: (5, 6)
star_rating | title | content_rating | genre | duration | actors_list |
---|---|---|---|---|---|
f64 | str | str | str | i64 | str |
7.8 | "(500) Days of Summer" | "PG-13" | "Comedy" | 95 | "[u'Zooey Deschanel', u'Joseph … |
8.9 | "12 Angry Men" | "NOT RATED" | "Drama" | 96 | "[u'Henry Fonda', u'Lee J. Cobb… |
8.1 | "12 Years a Slave" | "R" | "Biography" | 134 | "[u'Chiwetel Ejiofor', u'Michae… |
7.6 | "127 Hours" | "R" | "Adventure" | 94 | "[u'James Franco', u'Amber Tamb… |
8.3 | "2001: A Space Odyssey" | "G" | "Mystery" | 160 | "[u'Keir Dullea', u'Gary Lockwo… |
# sort the DataFrame first by 'content_rating', then by 'duration'
movies.sort(by=["content_rating", "duration"]).head()
shape: (5, 6)
star_rating | title | content_rating | genre | duration | actors_list |
---|---|---|---|---|---|
f64 | str | str | str | i64 | str |
8.2 | "Butch Cassidy and the Sundance… | null | "Biography" | 110 | "[u'Paul Newman', u'Robert Redf… |
7.4 | "True Grit" | null | "Adventure" | 128 | "[u'John Wayne', u'Kim Darby', … |
7.7 | "Where Eagles Dare" | null | "Action" | 158 | "[u'Richard Burton', u'Clint Ea… |
7.6 | "The Jungle Book" | "APPROVED" | "Animation" | 78 | "[u'Phil Harris', u'Sebastian C… |
7.8 | "Invasion of the Body Snatchers" | "APPROVED" | "Horror" | 80 | "[u'Kevin McCarthy', u'Dana Wyn… |
Transform#
Group#
drinks = pl.read_csv("data/drinks.csv")
drinks.head()
shape: (5, 6)
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent |
---|---|---|---|---|---|
str | i64 | i64 | i64 | f64 | str |
"Afghanistan" | 0 | 0 | 0 | 0.0 | "Asia" |
"Albania" | 89 | 132 | 54 | 4.9 | "Europe" |
"Algeria" | 25 | 0 | 14 | 0.7 | "Africa" |
"Andorra" | 245 | 138 | 312 | 12.4 | "Europe" |
"Angola" | 217 | 57 | 45 | 5.9 | "Africa" |
# calculate the mean beer servings for each continent
drinks.group_by("continent").mean()["beer_servings"]
shape: (6,)
beer_servings |
---|
f64 |
89.6875 |
193.777778 |
145.434783 |
175.083333 |
37.045455 |
61.471698 |
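# the group order above is not deterministic; a sketch with 'maintain_order=True',
# which keeps groups in order of first appearance
drinks.group_by("continent", maintain_order=True).mean()["beer_servings"]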
# other aggregation functions (such as 'max') can also be used with group_by
drinks.group_by("continent").max()["beer_servings"]
shape: (6,)
beer_servings |
---|
i64 |
306 |
285 |
247 |
361 |
376 |
333 |
# multiple aggregation functions can be applied simultaneously
drinks.group_by("continent").agg(
    pl.col("beer_servings").count().name.suffix("_count"),
    pl.col("beer_servings").mean().name.suffix("_mean"),
    pl.col("beer_servings").min().name.suffix("_min"),
    pl.col("beer_servings").max().name.suffix("_max"),
)
shape: (6, 5)
continent | beer_servings_count | beer_servings_mean | beer_servings_min | beer_servings_max |
---|---|---|---|---|
str | u32 | f64 | i64 | i64 |
"South America" | 12 | 175.083333 | 93 | 333 |
"Africa" | 53 | 61.471698 | 0 | 376 |
"Europe" | 45 | 193.777778 | 0 | 361 |
"North America" | 23 | 145.434783 | 1 | 285 |
"Oceania" | 16 | 89.6875 | 0 | 306 |
"Asia" | 44 | 37.045455 | 0 | 247 |
Aggregate#
drinks.group_by("continent").mean()
shape: (6, 6)
continent | country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol |
---|---|---|---|---|---|
str | str | f64 | f64 | f64 | f64 |
"North America" | null | 145.434783 | 165.73913 | 24.521739 | 5.995652 |
"Africa" | null | 61.471698 | 16.339623 | 16.264151 | 3.007547 |
"Asia" | null | 37.045455 | 60.840909 | 9.068182 | 2.170455 |
"South America" | null | 175.083333 | 114.75 | 62.416667 | 6.308333 |
"Oceania" | null | 89.6875 | 58.4375 | 35.625 | 3.38125 |
"Europe" | null | 193.777778 | 132.555556 | 142.222222 | 8.617778 |
iris = pl.read_csv("data/iris.csv")
iris.head()
shape: (5, 5)
sepal_length | sepal_width | petal_length | petal_width | species |
---|---|---|---|---|
f64 | f64 | f64 | f64 | str |
5.1 | 3.5 | 1.4 | 0.2 | "setosa" |
4.9 | 3.0 | 1.4 | 0.2 | "setosa" |
4.7 | 3.2 | 1.3 | 0.2 | "setosa" |
4.6 | 3.1 | 1.5 | 0.2 | "setosa" |
5.0 | 3.6 | 1.4 | 0.2 | "setosa" |
iris.group_by("species").len()
shape: (3, 2)
species | len |
---|---|
str | u32 |
"virginica" | 50 |
"setosa" | 50 |
"versicolor" | 50 |
iris.group_by("species").agg([
    pl.min("sepal_length").name.suffix("_min"),
    pl.max("sepal_length").name.suffix("_max"),
    pl.mean("petal_length").name.suffix("_mean"),
    pl.std("petal_length").name.suffix("_std"),
])
shape: (3, 5)
species | sepal_length_min | sepal_length_max | petal_length_mean | petal_length_std |
---|---|---|---|---|
str | f64 | f64 | f64 | f64 |
"virginica" | 4.9 | 7.9 | 5.552 | 0.551895 |
"setosa" | 4.3 | 5.8 | 1.462 | 0.173664 |
"versicolor" | 4.9 | 7.0 | 4.26 | 0.469911 |
Apply a function#
train = pl.read_csv("data/titanic_train.csv")
train.head()
shape: (5, 12)
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
---|---|---|---|---|---|---|---|---|---|---|---|
i64 | i64 | i64 | str | str | f64 | i64 | i64 | str | f64 | str | str |
1 | 0 | 3 | "Braund, Mr. Owen Harris" | "male" | 22.0 | 1 | 0 | "A/5 21171" | 7.25 | null | "S" |
2 | 1 | 1 | "Cumings, Mrs. John Bradley (Fl… | "female" | 38.0 | 1 | 0 | "PC 17599" | 71.2833 | "C85" | "C" |
3 | 1 | 3 | "Heikkinen, Miss. Laina" | "female" | 26.0 | 0 | 0 | "STON/O2. 3101282" | 7.925 | null | "S" |
4 | 1 | 1 | "Futrelle, Mrs. Jacques Heath (… | "female" | 35.0 | 1 | 0 | "113803" | 53.1 | "C123" | "S" |
5 | 0 | 3 | "Allen, Mr. William Henry" | "male" | 35.0 | 0 | 0 | "373450" | 8.05 | null | "S" |
# map 'female' to 0 and 'male' to 1
train = train.with_columns(Sex_num=pl.col("Sex").replace({"female": 0, "male": 1}))
train[0:4, ["Sex", "Sex_num"]]
shape: (4, 2)
Sex | Sex_num |
---|---|
str | str |
"male" | "1" |
"female" | "0" |
"female" | "0" |
"female" | "0" |
# calculate the length of each string in the 'Name' Series
# (note: 'pl.col("Name").len()' would return the column length, not per-string lengths)
train = train.with_columns(Name_length=pl.col("Name").str.len_chars())
train[0:4, ["Name", "Name_length"]]
shape: (4, 2)
Name | Name_length |
---|---|
str | u32 |
"Braund, Mr. Owen Harris" | 23 |
"Cumings, Mrs. John Bradley (Fl… | 51 |
"Heikkinen, Miss. Laina" | 22 |
"Futrelle, Mrs. Jacques Heath (… | 44 |
# use a string method to split the 'Name' Series at commas (returns a Series of lists)
train["Name"].str.split(",").head()
shape: (10,)
Name |
---|
list[str] |
["Braund", " Mr. Owen Harris"] |
["Cumings", " Mrs. John Bradley (Florence Briggs Thayer)"] |
["Heikkinen", " Miss. Laina"] |
["Futrelle", " Mrs. Jacques Heath (Lily May Peel)"] |
["Allen", " Mr. William Henry"] |
["Moran", " Mr. James"] |
["McCarthy", " Mr. Timothy J"] |
["Palsson", " Master. Gosta Leonard"] |
["Johnson", " Mrs. Oscar W (Elisabeth Vilhelmina Berg)"] |
["Nasser", " Mrs. Nicholas (Adele Achem)"] |
# use a lambda function with 'map_elements' to extract the first list element
train["Name"].str.split(",").map_elements(lambda x: x[0], return_dtype=pl.String).head()
shape: (10,)
Name |
---|
str |
"Braund" |
"Cumings" |
"Heikkinen" |
"Futrelle" |
"Allen" |
"Moran" |
"McCarthy" |
"Palsson" |
"Johnson" |
"Nasser" |
Pivot#
stocks = pl.read_csv("data/stocks.csv")
stocks
shape: (9, 4)
Date | Close | Volume | Symbol |
---|---|---|---|
str | f64 | i64 | str |
"2016-10-03" | 31.5 | 14070500 | "CSCO" |
"2016-10-03" | 112.52 | 21701800 | "AAPL" |
"2016-10-03" | 57.42 | 19189500 | "MSFT" |
"2016-10-04" | 113.0 | 29736800 | "AAPL" |
"2016-10-04" | 57.24 | 20085900 | "MSFT" |
"2016-10-04" | 31.35 | 18460400 | "CSCO" |
"2016-10-05" | 57.64 | 16726400 | "MSFT" |
"2016-10-05" | 31.59 | 11808600 | "CSCO" |
"2016-10-05" | 113.05 | 21453100 | "AAPL" |
stocks = stocks.pivot(values="Close", index="Symbol", on="Date")
stocks
shape: (3, 4)
Symbol | 2016-10-03 | 2016-10-04 | 2016-10-05 |
---|---|---|---|
str | f64 | f64 | f64 |
"CSCO" | 31.5 | 31.35 | 31.59 |
"AAPL" | 112.52 | 113.0 | 113.05 |
"MSFT" | 57.42 | 57.24 | 57.64 |
Clean#
Remove duplicate rows#
user_cols = ["user_id", "age", "gender", "occupation", "zip_code"]
users = pl.read_csv("data/movie.user", separator="|", has_header=False)
users.columns = user_cols
users.head()
shape: (5, 5)
user_id | age | gender | occupation | zip_code |
---|---|---|---|---|
i64 | i64 | str | str | str |
1 | 24 | "M" | "technician" | "85711" |
2 | 53 | "F" | "other" | "94043" |
3 | 23 | "M" | "writer" | "32067" |
4 | 24 | "M" | "technician" | "43537" |
5 | 33 | "F" | "other" | "15213" |
users.shape
(943, 5)
# count the duplicated values ('is_duplicated' marks every occurrence, and True sums as 1)
users["zip_code"].is_duplicated().sum()
250
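# a sketch of actually removing duplicates, as the heading promises:
# 'unique' keeps one row per duplicate group
users.unique(keep="first").shape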
Detect missing values#
ufo = pl.read_csv("data/ufo.csv")
ufo.tail()
shape: (5, 5)
City | Colors Reported | Shape Reported | State | Time |
---|---|---|---|---|
str | str | str | str | str |
"Grant Park" | null | "TRIANGLE" | "IL" | "12/31/2000 23:00" |
"Spirit Lake" | null | "DISK" | "IA" | "12/31/2000 23:00" |
"Eagle River" | null | null | "WI" | "12/31/2000 23:45" |
"Eagle River" | "RED" | "LIGHT" | "WI" | "12/31/2000 23:45" |
"Ybor" | null | "OVAL" | "FL" | "12/31/2000 23:59" |
ufo.select(pl.all().is_null())
shape: (18_241, 5)
City | Colors Reported | Shape Reported | State | Time |
---|---|---|---|---|
bool | bool | bool | bool | bool |
false | true | false | false | false |
false | true | false | false | false |
false | true | false | false | false |
false | true | false | false | false |
false | true | false | false | false |
… | … | … | … | … |
false | true | false | false | false |
false | true | false | false | false |
false | true | true | false | false |
false | false | false | false | false |
false | true | false | false | false |
ufo.select(pl.all().is_not_null())
shape: (18_241, 5)
City | Colors Reported | Shape Reported | State | Time |
---|---|---|---|---|
bool | bool | bool | bool | bool |
true | false | true | true | true |
true | false | true | true | true |
true | false | true | true | true |
true | false | true | true | true |
true | false | true | true | true |
… | … | … | … | … |
true | false | true | true | true |
true | false | true | true | true |
true | false | false | true | true |
true | true | true | true | true |
true | false | true | true | true |
# count the number of missing values in each Series
ufo.select(pl.all().is_null().sum())  # equivalently: ufo.null_count()
shape: (1, 5)
City | Colors Reported | Shape Reported | State | Time |
---|---|---|---|---|
u32 | u32 | u32 | u32 | u32 |
25 | 15359 | 2644 | 0 | 0 |
# if 'any' values are missing in a row, then drop that row
ufo.drop_nulls().shape
(2486, 5)
# if 'all' values are missing in a row, then drop that row (none are dropped in this case)
ufo.filter(~pl.all_horizontal(pl.all().is_null())).shape
(18241, 5)
# if 'all' values are missing in a row (considering only 'City' and 'Shape Reported'), then drop that row
ufo.filter(~pl.all_horizontal(pl.col(["City", "Shape Reported"]).is_null())).shape
(18237, 5)
# unlike pandas, 'value_counts' includes missing values (note the null row below)
ufo["Shape Reported"].value_counts()
shape: (28, 2)
Shape Reported | count |
---|---|
str | u32 |
"CONE" | 60 |
"FLASH" | 188 |
"SPHERE" | 1054 |
"CROSS" | 36 |
"EGG" | 197 |
… | … |
"VARIOUS" | 333 |
"FLARE" | 1 |
null | 2644 |
"TEARDROP" | 119 |
"CRESCENT" | 2 |
Fill missing values#
# fill in missing values with specified values (assign the result; Polars never modifies in place)
ufo_filled = ufo.with_columns([
    pl.col("Shape Reported").fill_null(value="VARIOUS"),
    pl.col("Colors Reported").fill_null(value="UNKNOWN"),
])
ufo_filled
shape: (18_241, 5)
City | Colors Reported | Shape Reported | State | Time |
---|---|---|---|---|
str | str | str | str | str |
"Ithaca" | "UNKNOWN" | "TRIANGLE" | "NY" | "6/1/1930 22:00" |
"Willingboro" | "UNKNOWN" | "OTHER" | "NJ" | "6/30/1930 20:00" |
"Holyoke" | "UNKNOWN" | "OVAL" | "CO" | "2/15/1931 14:00" |
"Abilene" | "UNKNOWN" | "DISK" | "KS" | "6/1/1931 13:00" |
"New York Worlds Fair" | "UNKNOWN" | "LIGHT" | "NY" | "4/18/1933 19:00" |
… | … | … | … | … |
"Grant Park" | "UNKNOWN" | "TRIANGLE" | "IL" | "12/31/2000 23:00" |
"Spirit Lake" | "UNKNOWN" | "DISK" | "IA" | "12/31/2000 23:00" |
"Eagle River" | "UNKNOWN" | "VARIOUS" | "WI" | "12/31/2000 23:45" |
"Eagle River" | "RED" | "LIGHT" | "WI" | "12/31/2000 23:45" |
"Ybor" | "UNKNOWN" | "OVAL" | "FL" | "12/31/2000 23:59" |
# confirm that the missing values were filled in
ufo_filled["Shape Reported"].value_counts().head()
shape: (5, 2)
Shape Reported | count |
---|---|
str | u32 |
"CONE" | 60 |
"CIGAR" | 617 |
"OTHER" | 1402 |
"CRESCENT" | 2 |
"LIGHT" | 2803 |
# fill missing values using the "forward fill" strategy (returns a new DataFrame; 'ufo' is unchanged)
ufo.fill_null(strategy="forward").tail()
shape: (5, 5)
City | Colors Reported | Shape Reported | State | Time |
---|---|---|---|---|
str | str | str | str | str |
"Grant Park" | "RED" | "TRIANGLE" | "IL" | "12/31/2000 23:00" |
"Spirit Lake" | "RED" | "DISK" | "IA" | "12/31/2000 23:00" |
"Eagle River" | "RED" | "DISK" | "WI" | "12/31/2000 23:45" |
"Eagle River" | "RED" | "LIGHT" | "WI" | "12/31/2000 23:45" |
"Ybor" | "RED" | "OVAL" | "FL" | "12/31/2000 23:59" |
# compare with "forward fill" strategy (doesn't affect the DataFrame since inplace=False)
ufo.fill_null(strategy="backward").tail()
shape: (5, 5)
City | Colors Reported | Shape Reported | State | Time |
---|---|---|---|---|
str | str | str | str | str |
"Grant Park" | "RED" | "TRIANGLE" | "IL" | "12/31/2000 23:00" |
"Spirit Lake" | "RED" | "DISK" | "IA" | "12/31/2000 23:00" |
"Eagle River" | "RED" | "LIGHT" | "WI" | "12/31/2000 23:45" |
"Eagle River" | "RED" | "LIGHT" | "WI" | "12/31/2000 23:45" |
"Ybor" | null | "OVAL" | "FL" | "12/31/2000 23:59" |
Outliers (clip)#
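# values below 10 are raised to 10 and values above 300 are lowered to 300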
drinks["wine_servings"].clip(10, 300)
shape: (193,)
wine_servings |
---|
i64 |
10 |
54 |
14 |
300 |
45 |
… |
10 |
10 |
10 |
10 |
10 |
String & Time#
drinks = pl.read_csv("data/drinks.csv")
drinks.head()
shape: (5, 6)
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent |
---|---|---|---|---|---|
str | i64 | i64 | i64 | f64 | str |
"Afghanistan" | 0 | 0 | 0 | 0.0 | "Asia" |
"Albania" | 89 | 132 | 54 | 4.9 | "Europe" |
"Algeria" | 25 | 0 | 14 | 0.7 | "Africa" |
"Andorra" | 245 | 138 | 312 | 12.4 | "Europe" |
"Angola" | 217 | 57 | 45 | 5.9 | "Africa" |
Use string methods#
orders = pl.read_csv("data/chipotle.tsv", separator="\t")
orders.head()
shape: (5, 5)
order_id | quantity | item_name | choice_description | item_price |
---|---|---|---|---|
i64 | i64 | str | str | str |
1 | 1 | "Chips and Fresh Tomato Salsa" | "NULL" | "$2.39" |
1 | 1 | "Izze" | "[Clementine]" | "$3.39" |
1 | 1 | "Nantucket Nectar" | "[Apple]" | "$3.39" |
1 | 1 | "Chips and Tomatillo-Green Chil… | "NULL" | "$2.39" |
2 | 2 | "Chicken Bowl" | "[Tomatillo-Red Chili Salsa (Ho… | "$16.98" |
# string methods for Polars Series are accessed via the 'str' namespace
orders["item_name"].str.to_uppercase().head()
shape: (10,)
item_name |
---|
str |
"CHIPS AND FRESH TOMATO SALSA" |
"IZZE" |
"NANTUCKET NECTAR" |
"CHIPS AND TOMATILLO-GREEN CHIL… |
"CHICKEN BOWL" |
"CHICKEN BOWL" |
"SIDE OF CHIPS" |
"STEAK BURRITO" |
"STEAK SOFT TACOS" |
"STEAK BURRITO" |
# use a boolean expression inside 'filter' to subset the DataFrame
orders.filter(pl.col("item_name").str.contains("Chicken")).head()
shape: (5, 5)
order_id | quantity | item_name | choice_description | item_price |
---|---|---|---|---|
i64 | i64 | str | str | str |
2 | 2 | "Chicken Bowl" | "[Tomatillo-Red Chili Salsa (Ho… | "$16.98" |
3 | 1 | "Chicken Bowl" | "[Fresh Tomato Salsa (Mild), [R… | "$10.98" |
6 | 1 | "Chicken Crispy Tacos" | "[Roasted Chili Corn Salsa, [Fa… | "$8.75" |
6 | 1 | "Chicken Soft Tacos" | "[Roasted Chili Corn Salsa, [Ri… | "$8.75" |
7 | 1 | "Chicken Bowl" | "[Fresh Tomato Salsa, [Fajita V… | "$11.25" |
# many Polars string methods accept regular expressions (regex); note that
# 'str.replace' replaces only the first match, so one bracket remains below
orders["choice_description"].str.replace(r"[\[\]]", "").head()
shape: (10,)
choice_description |
---|
str |
"NULL" |
"Clementine]" |
"Apple]" |
"NULL" |
"Tomatillo-Red Chili Salsa (Hot… |
"Fresh Tomato Salsa (Mild), [Ri… |
"NULL" |
"Tomatillo Red Chili Salsa, [Fa… |
"Tomatillo Green Chili Salsa, [… |
"Fresh Tomato Salsa, [Rice, Bla… |
Dates & Times#
# read a dataset of UFO reports into a DataFrame
ufo = pl.read_csv("data/ufo.csv")
ufo.head()
shape: (5, 5)
City | Colors Reported | Shape Reported | State | Time |
---|---|---|---|---|
str | str | str | str | str |
"Ithaca" | null | "TRIANGLE" | "NY" | "6/1/1930 22:00" |
"Willingboro" | null | "OTHER" | "NJ" | "6/30/1930 20:00" |
"Holyoke" | null | "OVAL" | "CO" | "2/15/1931 14:00" |
"Abilene" | null | "DISK" | "KS" | "6/1/1931 13:00" |
"New York Worlds Fair" | null | "LIGHT" | "NY" | "4/18/1933 19:00" |
# 'Time' is currently stored as a string
ufo.dtypes
[String, String, String, String, String]
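# a sketch of parsing 'Time' into a proper Datetime column, assuming every row
# follows the month/day/year pattern seen above
ufo.with_columns(pl.col("Time").str.to_datetime("%m/%d/%Y %H:%M")).dtypes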
Categorical data#
drinks = pl.read_csv("data/drinks.csv")
drinks.head()
shape: (5, 6)
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent |
---|---|---|---|---|---|
str | i64 | i64 | i64 | f64 | str |
"Afghanistan" | 0 | 0 | 0 | 0.0 | "Asia" |
"Albania" | 89 | 132 | 54 | 4.9 | "Europe" |
"Algeria" | 25 | 0 | 14 | 0.7 | "Africa" |
"Andorra" | 245 | 138 | 312 | 12.4 | "Europe" |
"Angola" | 217 | 57 | 45 | 5.9 | "Africa" |
# use the 'Categorical' data type to store the 'continent' strings as integer codes
drinks = drinks.with_columns(pl.col("continent").cast(pl.Categorical))
drinks.dtypes
[String, Int64, Int64, Int64, Float64, Categorical(ordering='physical')]
# strings are now encoded by order of appearance (0 means 'Asia', 1 means 'Europe', 2 means 'Africa', etc.)
drinks["continent"].to_physical().head()
shape: (10,)
continent |
---|
u32 |
0 |
1 |
2 |
1 |
2 |
3 |
4 |
1 |
5 |
1 |
# repeat this process for the 'country' Series
drinks = drinks.with_columns(pl.col("country").cast(pl.Categorical))
# every country is unique, so this creates 193 categories and the encoding saves nothing here
drinks["country"].cat.get_categories()
shape: (193,)
country |
---|
str |
"Afghanistan" |
"Albania" |
"Algeria" |
"Andorra" |
"Angola" |
… |
"Venezuela" |
"Vietnam" |
"Yemen" |
"Zambia" |
"Zimbabwe" |
Dummy variables#
train = pl.read_csv("data/titanic_train.csv")
train.head()
shape: (5, 12)
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
---|---|---|---|---|---|---|---|---|---|---|---|
i64 | i64 | i64 | str | str | f64 | i64 | i64 | str | f64 | str | str |
1 | 0 | 3 | "Braund, Mr. Owen Harris" | "male" | 22.0 | 1 | 0 | "A/5 21171" | 7.25 | null | "S" |
2 | 1 | 1 | "Cumings, Mrs. John Bradley (Fl… | "female" | 38.0 | 1 | 0 | "PC 17599" | 71.2833 | "C85" | "C" |
3 | 1 | 3 | "Heikkinen, Miss. Laina" | "female" | 26.0 | 0 | 0 | "STON/O2. 3101282" | 7.925 | null | "S" |
4 | 1 | 1 | "Futrelle, Mrs. Jacques Heath (… | "female" | 35.0 | 1 | 0 | "113803" | 53.1 | "C123" | "S" |
5 | 0 | 3 | "Allen, Mr. William Henry" | "male" | 35.0 | 0 | 0 | "373450" | 8.05 | null | "S" |
# use 'to_dummies' to create one column for every possible value
train.to_dummies(["Sex"]).head()
shape: (5, 13)
PassengerId | Survived | Pclass | Name | Sex_female | Sex_male | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
---|---|---|---|---|---|---|---|---|---|---|---|---|
i64 | i64 | i64 | str | u8 | u8 | f64 | i64 | i64 | str | f64 | str | str |
1 | 0 | 3 | "Braund, Mr. Owen Harris" | 0 | 1 | 22.0 | 1 | 0 | "A/5 21171" | 7.25 | null | "S" |
2 | 1 | 1 | "Cumings, Mrs. John Bradley (Fl… | 1 | 0 | 38.0 | 1 | 0 | "PC 17599" | 71.2833 | "C85" | "C" |
3 | 1 | 3 | "Heikkinen, Miss. Laina" | 1 | 0 | 26.0 | 0 | 0 | "STON/O2. 3101282" | 7.925 | null | "S" |
4 | 1 | 1 | "Futrelle, Mrs. Jacques Heath (… | 1 | 0 | 35.0 | 1 | 0 | "113803" | 53.1 | "C123" | "S" |
5 | 0 | 3 | "Allen, Mr. William Henry" | 0 | 1 | 35.0 | 0 | 0 | "373450" | 8.05 | null | "S" |
# use slicing to drop the first column ('PassengerId')
# use a custom separator to mark the source of the dummy variables
train.to_dummies(["Sex"], separator=":")[:, 1:].head()
shape: (5, 12)
Survived | Pclass | Name | Sex:female | Sex:male | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
---|---|---|---|---|---|---|---|---|---|---|---|
i64 | i64 | str | u8 | u8 | f64 | i64 | i64 | str | f64 | str | str |
0 | 3 | "Braund, Mr. Owen Harris" | 0 | 1 | 22.0 | 1 | 0 | "A/5 21171" | 7.25 | null | "S" |
1 | 1 | "Cumings, Mrs. John Bradley (Fl… | 1 | 0 | 38.0 | 1 | 0 | "PC 17599" | 71.2833 | "C85" | "C" |
1 | 3 | "Heikkinen, Miss. Laina" | 1 | 0 | 26.0 | 0 | 0 | "STON/O2. 3101282" | 7.925 | null | "S" |
1 | 1 | "Futrelle, Mrs. Jacques Heath (… | 1 | 0 | 35.0 | 1 | 0 | "113803" | 53.1 | "C123" | "S" |
0 | 3 | "Allen, Mr. William Henry" | 0 | 1 | 35.0 | 0 | 0 | "373450" | 8.05 | null | "S" |
# use 'to_dummies' with a feature that has 3 possible values (nulls get their own dummy column)
train.to_dummies(["Embarked"], separator=":").head(10)
shape: (10, 15)
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked:C | Embarked:Q | Embarked:S | Embarked:null |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
i64 | i64 | i64 | str | str | f64 | i64 | i64 | str | f64 | str | u8 | u8 | u8 | u8 |
1 | 0 | 3 | "Braund, Mr. Owen Harris" | "male" | 22.0 | 1 | 0 | "A/5 21171" | 7.25 | null | 0 | 0 | 1 | 0 |
2 | 1 | 1 | "Cumings, Mrs. John Bradley (Fl… | "female" | 38.0 | 1 | 0 | "PC 17599" | 71.2833 | "C85" | 1 | 0 | 0 | 0 |
3 | 1 | 3 | "Heikkinen, Miss. Laina" | "female" | 26.0 | 0 | 0 | "STON/O2. 3101282" | 7.925 | null | 0 | 0 | 1 | 0 |
4 | 1 | 1 | "Futrelle, Mrs. Jacques Heath (… | "female" | 35.0 | 1 | 0 | "113803" | 53.1 | "C123" | 0 | 0 | 1 | 0 |
5 | 0 | 3 | "Allen, Mr. William Henry" | "male" | 35.0 | 0 | 0 | "373450" | 8.05 | null | 0 | 0 | 1 | 0 |
6 | 0 | 3 | "Moran, Mr. James" | "male" | null | 0 | 0 | "330877" | 8.4583 | null | 0 | 1 | 0 | 0 |
7 | 0 | 1 | "McCarthy, Mr. Timothy J" | "male" | 54.0 | 0 | 0 | "17463" | 51.8625 | "E46" | 0 | 0 | 1 | 0 |
8 | 0 | 3 | "Palsson, Master. Gosta Leonard" | "male" | 2.0 | 3 | 1 | "349909" | 21.075 | null | 0 | 0 | 1 | 0 |
9 | 1 | 3 | "Johnson, Mrs. Oscar W (Elisabe… | "female" | 27.0 | 0 | 2 | "347742" | 11.1333 | null | 0 | 0 | 1 | 0 |
10 | 1 | 2 | "Nasser, Mrs. Nicholas (Adele A… | "female" | 14.0 | 1 | 0 | "237736" | 30.0708 | null | 1 | 0 | 0 | 0 |
# specify multiple columns to dummy at once (the original columns are dropped)
train.to_dummies(["Sex", "Embarked"]).head()
shape: (5, 16)
PassengerId | Survived | Pclass | Name | Sex_female | Sex_male | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked_C | Embarked_Q | Embarked_S | Embarked_null |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
i64 | i64 | i64 | str | u8 | u8 | f64 | i64 | i64 | str | f64 | str | u8 | u8 | u8 | u8 |
1 | 0 | 3 | "Braund, Mr. Owen Harris" | 0 | 1 | 22.0 | 1 | 0 | "A/5 21171" | 7.25 | null | 0 | 0 | 1 | 0 |
2 | 1 | 1 | "Cumings, Mrs. John Bradley (Fl… | 1 | 0 | 38.0 | 1 | 0 | "PC 17599" | 71.2833 | "C85" | 1 | 0 | 0 | 0 |
3 | 1 | 3 | "Heikkinen, Miss. Laina" | 1 | 0 | 26.0 | 0 | 0 | "STON/O2. 3101282" | 7.925 | null | 0 | 0 | 1 | 0 |
4 | 1 | 1 | "Futrelle, Mrs. Jacques Heath (… | 1 | 0 | 35.0 | 1 | 0 | "113803" | 53.1 | "C123" | 0 | 0 | 1 | 0 |
5 | 0 | 3 | "Allen, Mr. William Henry" | 0 | 1 | 35.0 | 0 | 0 | "373450" | 8.05 | null | 0 | 0 | 1 | 0 |
# use the 'drop_first' parameter to drop the first dummy variable for each feature
train.to_dummies(["Sex", "Embarked"], drop_first=True).head()
shape: (5, 14)
PassengerId | Survived | Pclass | Name | Sex_female | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked_C | Embarked_Q | Embarked_null |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
i64 | i64 | i64 | str | u8 | f64 | i64 | i64 | str | f64 | str | u8 | u8 | u8 |
1 | 0 | 3 | "Braund, Mr. Owen Harris" | 0 | 22.0 | 1 | 0 | "A/5 21171" | 7.25 | null | 0 | 0 | 0 |
2 | 1 | 1 | "Cumings, Mrs. John Bradley (Fl… | 1 | 38.0 | 1 | 0 | "PC 17599" | 71.2833 | "C85" | 1 | 0 | 0 |
3 | 1 | 3 | "Heikkinen, Miss. Laina" | 1 | 26.0 | 0 | 0 | "STON/O2. 3101282" | 7.925 | null | 0 | 0 | 0 |
4 | 1 | 1 | "Futrelle, Mrs. Jacques Heath (… | 1 | 35.0 | 1 | 0 | "113803" | 53.1 | "C123" | 0 | 0 | 0 |
5 | 0 | 3 | "Allen, Mr. William Henry" | 0 | 35.0 | 0 | 0 | "373450" | 8.05 | null | 0 | 0 | 0 |
Ordinal data#
# create a small DataFrame from a dictionary
small = pl.DataFrame({
    "ID": [100, 101, 102, 103],
    "quality": ["good", "good", "very good", "excellent"],
})
small.head()
shape: (4, 2)
ID | quality |
---|---|
i64 | str |
100 | "good" |
101 | "good" |
102 | "very good" |
103 | "excellent" |
# cast 'quality' to a categorical with lexical (alphabetical) ordering
small = small.with_columns(pl.col("quality").cast(pl.Categorical("lexical")))
small["quality"]
shape: (4,)
quality |
---|
cat |
"good" |
"good" |
"very good" |
"excellent" |
# sort the DataFrame by the 'quality' Series (lexical order)
small.sort(by="quality")
shape: (4, 2)
ID | quality |
---|---|
i64 | cat |
103 | "excellent" |
100 | "good" |
101 | "good" |
102 | "very good" |
# comparison operators respect the categorical ordering
small.filter(pl.col("quality") > "good")
shape: (1, 2)
ID | quality |
---|---|
i64 | cat |
102 | "very good" |
Sample rows#
# use the 'seed' parameter for reproducibility
ufo.sample(n=3, seed=42)
shape: (3, 5)
City | Colors Reported | Shape Reported | State | Time |
---|---|---|---|---|
str | str | str | str | str |
"Kennett Square" | null | "OTHER" | "PA" | "6/1/1975 22:00" |
"Landaff" | null | "TRIANGLE" | "NH" | "11/1/1986 22:50" |
"Mercedes" | null | "OVAL" | "TX" | "2/14/1986 8:15" |
# sample 75% of the DataFrame's rows without replacement
train = ufo.sample(fraction=0.75, seed=99)
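# a sketch of holding out the remaining 25% as a test set, using a temporary
# row index as the key for an anti-join
ufo_idx = ufo.with_row_index()
train = ufo_idx.sample(fraction=0.75, seed=99)
test = ufo_idx.join(train, on="index", how="anti")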