Cleaning and Preprocessing#
import polars as pl
Explore#
drinks = pl.read_csv("data/drinks.csv")
drinks.head()
shape: (5, 6)
| country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent |
|---|---|---|---|---|---|
| str | i64 | i64 | i64 | f64 | str |
| "Afghanistan" | 0 | 0 | 0 | 0.0 | "Asia" |
| "Albania" | 89 | 132 | 54 | 4.9 | "Europe" |
| "Algeria" | 25 | 0 | 14 | 0.7 | "Africa" |
| "Andorra" | 245 | 138 | 312 | 12.4 | "Europe" |
| "Angola" | 217 | 57 | 45 | 5.9 | "Africa" |
Max & Min#
drinks.top_k(10, by="beer_servings")
shape: (10, 6)
| country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent |
|---|---|---|---|---|---|
| str | i64 | i64 | i64 | f64 | str |
| "Namibia" | 376 | 3 | 1 | 6.8 | "Africa" |
| "Czech Republic" | 361 | 170 | 134 | 11.8 | "Europe" |
| "Gabon" | 347 | 98 | 59 | 8.9 | "Africa" |
| "Germany" | 346 | 117 | 175 | 11.3 | "Europe" |
| "Poland" | 343 | 215 | 56 | 10.9 | "Europe" |
| "Lithuania" | 343 | 244 | 56 | 12.9 | "Europe" |
| "Venezuela" | 333 | 100 | 3 | 7.7 | "South America" |
| "Ireland" | 313 | 118 | 165 | 11.4 | "Europe" |
| "Palau" | 306 | 63 | 23 | 6.9 | "Oceania" |
| "Romania" | 297 | 122 | 167 | 10.4 | "Europe" |
drinks["beer_servings"].top_k(10)
shape: (10,)
| beer_servings |
|---|
| i64 |
| 343 |
| 361 |
| 347 |
| 346 |
| 343 |
| 376 |
| 333 |
| 313 |
| 306 |
| 297 |
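# note that top_k does not guarantee sorted output, as seen above; chaining a
# sort gives a ranked Series (a sketch)
drinks["beer_servings"].top_k(10).sort(descending=True)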
Count#
drinks["continent"].value_counts()
shape: (6, 2)
| continent | count |
|---|---|
| str | u32 |
| "Europe" | 45 |
| "Asia" | 44 |
| "Africa" | 53 |
| "Oceania" | 16 |
| "South America" | 12 |
| "North America" | 23 |
drinks["continent"].value_counts(normalize=True)
shape: (6, 2)
| continent | proportion |
|---|---|
| str | f64 |
| "Europe" | 0.233161 |
| "South America" | 0.062176 |
| "North America" | 0.119171 |
| "Asia" | 0.227979 |
| "Oceania" | 0.082902 |
| "Africa" | 0.274611 |
drinks["continent"].value_counts().columns
['continent', 'count']
# value_counts returns a DataFrame, which can be sorted by any of its columns
drinks["continent"].value_counts().sort(by="count")
shape: (6, 2)
| continent | count |
|---|---|
| str | u32 |
| "South America" | 12 |
| "Oceania" | 16 |
| "North America" | 23 |
| "Asia" | 44 |
| "Europe" | 45 |
| "Africa" | 53 |
drinks["continent"].unique()
shape: (6,)
| continent |
|---|
| str |
| "Asia" |
| "North America" |
| "Africa" |
| "Oceania" |
| "South America" |
| "Europe" |
drinks["continent"].unique_counts()
shape: (6,)
| continent |
|---|
| u32 |
| 44 |
| 45 |
| 53 |
| 23 |
| 12 |
| 16 |
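# unique_counts returns bare counts in order of first appearance; pairing it
# with unique(maintain_order=True) keeps the labels alongside (a sketch,
# assuming both preserve the same appearance order)
pl.DataFrame({
    "continent": drinks["continent"].unique(maintain_order=True),
    "count": drinks["continent"].unique_counts(),
})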
Sort#
movies = pl.read_csv("data/imdb_1000.csv")
movies.head()
shape: (5, 6)
| star_rating | title | content_rating | genre | duration | actors_list |
|---|---|---|---|---|---|
| f64 | str | str | str | i64 | str |
| 9.3 | "The Shawshank Redemption" | "R" | "Crime" | 142 | "[u'Tim Robbins', u'Morgan Free… |
| 9.2 | "The Godfather" | "R" | "Crime" | 175 | "[u'Marlon Brando', u'Al Pacino… |
| 9.1 | "The Godfather: Part II" | "R" | "Crime" | 200 | "[u'Al Pacino', u'Robert De Nir… |
| 9.0 | "The Dark Knight" | "PG-13" | "Action" | 152 | "[u'Christian Bale', u'Heath Le… |
| 8.9 | "Pulp Fiction" | "R" | "Crime" | 154 | "[u'John Travolta', u'Uma Thurm… |
# sort the 'title' Series in ascending order (returns a Series)
movies["title"].sort().head()
shape: (10,)
| title |
|---|
| str |
| "(500) Days of Summer" |
| "12 Angry Men" |
| "12 Years a Slave" |
| "127 Hours" |
| "2001: A Space Odyssey" |
| "2046" |
| "21 Grams" |
| "25th Hour" |
| "28 Days Later..." |
| "3 Idiots" |
# sort in descending order instead
movies["title"].sort(descending=True).head()
shape: (10,)
| title |
|---|
| str |
| "[Rec]" |
| "Zulu" |
| "Zombieland" |
| "Zodiac" |
| "Zero Dark Thirty" |
| "Zelig" |
| "Young Frankenstein" |
| "Yojimbo" |
| "Yip Man" |
| "Ying xiong" |
# sort the entire DataFrame by the 'title' Series (returns a DataFrame)
movies.sort(by="title").head()
shape: (5, 6)
| star_rating | title | content_rating | genre | duration | actors_list |
|---|---|---|---|---|---|
| f64 | str | str | str | i64 | str |
| 7.8 | "(500) Days of Summer" | "PG-13" | "Comedy" | 95 | "[u'Zooey Deschanel', u'Joseph … |
| 8.9 | "12 Angry Men" | "NOT RATED" | "Drama" | 96 | "[u'Henry Fonda', u'Lee J. Cobb… |
| 8.1 | "12 Years a Slave" | "R" | "Biography" | 134 | "[u'Chiwetel Ejiofor', u'Michae… |
| 7.6 | "127 Hours" | "R" | "Adventure" | 94 | "[u'James Franco', u'Amber Tamb… |
| 8.3 | "2001: A Space Odyssey" | "G" | "Mystery" | 160 | "[u'Keir Dullea', u'Gary Lockwo… |
# passing descending=False explicitly gives the same ascending order (use descending=True to reverse)
movies.sort(by="title", descending=False).head()
shape: (5, 6)
| star_rating | title | content_rating | genre | duration | actors_list |
|---|---|---|---|---|---|
| f64 | str | str | str | i64 | str |
| 7.8 | "(500) Days of Summer" | "PG-13" | "Comedy" | 95 | "[u'Zooey Deschanel', u'Joseph … |
| 8.9 | "12 Angry Men" | "NOT RATED" | "Drama" | 96 | "[u'Henry Fonda', u'Lee J. Cobb… |
| 8.1 | "12 Years a Slave" | "R" | "Biography" | 134 | "[u'Chiwetel Ejiofor', u'Michae… |
| 7.6 | "127 Hours" | "R" | "Adventure" | 94 | "[u'James Franco', u'Amber Tamb… |
| 8.3 | "2001: A Space Odyssey" | "G" | "Mystery" | 160 | "[u'Keir Dullea', u'Gary Lockwo… |
# sort the DataFrame first by 'content_rating', then by 'duration'
movies.sort(by=["content_rating", "duration"]).head()
shape: (5, 6)
| star_rating | title | content_rating | genre | duration | actors_list |
|---|---|---|---|---|---|
| f64 | str | str | str | i64 | str |
| 8.2 | "Butch Cassidy and the Sundance… | null | "Biography" | 110 | "[u'Paul Newman', u'Robert Redf… |
| 7.4 | "True Grit" | null | "Adventure" | 128 | "[u'John Wayne', u'Kim Darby', … |
| 7.7 | "Where Eagles Dare" | null | "Action" | 158 | "[u'Richard Burton', u'Clint Ea… |
| 7.6 | "The Jungle Book" | "APPROVED" | "Animation" | 78 | "[u'Phil Harris', u'Sebastian C… |
| 7.8 | "Invasion of the Body Snatchers" | "APPROVED" | "Horror" | 80 | "[u'Kevin McCarthy', u'Dana Wyn… |
Transform#
Group#
drinks = pl.read_csv("data/drinks.csv")
drinks.head()
shape: (5, 6)
| country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent |
|---|---|---|---|---|---|
| str | i64 | i64 | i64 | f64 | str |
| "Afghanistan" | 0 | 0 | 0 | 0.0 | "Asia" |
| "Albania" | 89 | 132 | 54 | 4.9 | "Europe" |
| "Algeria" | 25 | 0 | 14 | 0.7 | "Africa" |
| "Andorra" | 245 | 138 | 312 | 12.4 | "Europe" |
| "Angola" | 217 | 57 | 45 | 5.9 | "Africa" |
# calculate the mean beer servings for each continent
drinks.group_by("continent").mean()["beer_servings"]
shape: (6,)
| beer_servings |
|---|
| f64 |
| 61.471698 |
| 89.6875 |
| 175.083333 |
| 193.777778 |
| 145.434783 |
| 37.045455 |
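# the aggregated Series above carries no continent labels; aggregating inside
# 'agg' keeps them (a sketch)
drinks.group_by("continent").agg(pl.col("beer_servings").mean())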
# other aggregation functions (such as 'max') can also be used with group_by
drinks.group_by("continent").max()["beer_servings"]
shape: (6,)
| beer_servings |
|---|
| i64 |
| 306 |
| 361 |
| 285 |
| 247 |
| 333 |
| 376 |
# multiple aggregation functions can be applied simultaneously
drinks.group_by("continent").agg(
pl.col("beer_servings").count().name.suffix("_count"),
pl.col("beer_servings").mean().name.suffix("_mean"),
pl.col("beer_servings").min().name.suffix("_min"),
pl.col("beer_servings").max().name.suffix("_max"),
)
shape: (6, 5)
| continent | beer_servings_count | beer_servings_mean | beer_servings_min | beer_servings_max |
|---|---|---|---|---|
| str | u32 | f64 | i64 | i64 |
| "South America" | 12 | 175.083333 | 93 | 333 |
| "Asia" | 44 | 37.045455 | 0 | 247 |
| "Oceania" | 16 | 89.6875 | 0 | 306 |
| "Africa" | 53 | 61.471698 | 0 | 376 |
| "North America" | 23 | 145.434783 | 1 | 285 |
| "Europe" | 45 | 193.777778 | 0 | 361 |
Apply#
drinks.group_by("continent").mean()
shape: (6, 6)
| continent | country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol |
|---|---|---|---|---|---|
| str | str | f64 | f64 | f64 | f64 |
| "Asia" | null | 37.045455 | 60.840909 | 9.068182 | 2.170455 |
| "Europe" | null | 193.777778 | 132.555556 | 142.222222 | 8.617778 |
| "Africa" | null | 61.471698 | 16.339623 | 16.264151 | 3.007547 |
| "South America" | null | 175.083333 | 114.75 | 62.416667 | 6.308333 |
| "Oceania" | null | 89.6875 | 58.4375 | 35.625 | 3.38125 |
| "North America" | null | 145.434783 | 165.73913 | 24.521739 | 5.995652 |
iris = pl.read_csv("data/iris.csv")
iris.head()
shape: (5, 5)
| sepal_length | sepal_width | petal_length | petal_width | species |
|---|---|---|---|---|
| f64 | f64 | f64 | f64 | str |
| 5.1 | 3.5 | 1.4 | 0.2 | "setosa" |
| 4.9 | 3.0 | 1.4 | 0.2 | "setosa" |
| 4.7 | 3.2 | 1.3 | 0.2 | "setosa" |
| 4.6 | 3.1 | 1.5 | 0.2 | "setosa" |
| 5.0 | 3.6 | 1.4 | 0.2 | "setosa" |
iris.group_by("species").len()
shape: (3, 2)
| species | len |
|---|---|
| str | u32 |
| "versicolor" | 50 |
| "virginica" | 50 |
| "setosa" | 50 |
iris.group_by("species").agg([
pl.min("sepal_length").name.suffix("_min"),
pl.max("sepal_length").name.suffix("_max"),
pl.mean("petal_length").name.suffix("_mean"),
pl.std("petal_length").name.suffix("_std"),
])
shape: (3, 5)
| species | sepal_length_min | sepal_length_max | petal_length_mean | petal_length_std |
|---|---|---|---|---|
| str | f64 | f64 | f64 | f64 |
| "virginica" | 4.9 | 7.9 | 5.552 | 0.551895 |
| "versicolor" | 4.9 | 7.0 | 4.26 | 0.469911 |
| "setosa" | 4.3 | 5.8 | 1.462 | 0.173664 |
Apply a function#
train = pl.read_csv("data/titanic_train.csv")
train.head()
shape: (5, 12)
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|
| i64 | i64 | i64 | str | str | f64 | i64 | i64 | str | f64 | str | str |
| 1 | 0 | 3 | "Braund, Mr. Owen Harris" | "male" | 22.0 | 1 | 0 | "A/5 21171" | 7.25 | null | "S" |
| 2 | 1 | 1 | "Cumings, Mrs. John Bradley (Fl… | "female" | 38.0 | 1 | 0 | "PC 17599" | 71.2833 | "C85" | "C" |
| 3 | 1 | 3 | "Heikkinen, Miss. Laina" | "female" | 26.0 | 0 | 0 | "STON/O2. 3101282" | 7.925 | null | "S" |
| 4 | 1 | 1 | "Futrelle, Mrs. Jacques Heath (… | "female" | 35.0 | 1 | 0 | "113803" | 53.1 | "C123" | "S" |
| 5 | 0 | 3 | "Allen, Mr. William Henry" | "male" | 35.0 | 0 | 0 | "373450" | 8.05 | null | "S" |
# map 'female' to '0' and 'male' to '1' (note that 'replace' preserves the String dtype)
train = train.with_columns(Sex_num=pl.col("Sex").replace({"female": 0, "male": 1}))
train[0:4, ["Sex", "Sex_num"]]
shape: (4, 2)
| Sex | Sex_num |
|---|---|
| str | str |
| "male" | "1" |
| "female" | "0" |
| "female" | "0" |
| "female" | "0" |
# calculate the length of each string in the 'Name' Series
# (note: pl.col("Name").len() would return the column length, not per-string lengths)
train = train.with_columns(Name_length=pl.col("Name").str.len_chars())
train[0:4, ["Name", "Name_length"]]
shape: (4, 2)
| Name | Name_length |
|---|---|
| str | u32 |
| "Braund, Mr. Owen Harris" | 23 |
| "Cumings, Mrs. John Bradley (Fl… | 51 |
| "Heikkinen, Miss. Laina" | 22 |
| "Futrelle, Mrs. Jacques Heath (… | 44 |
# use a string method to split the 'Name' Series at commas (returns a Series of lists)
train["Name"].str.split(",").head()
shape: (10,)
| Name |
|---|
| list[str] |
| ["Braund", " Mr. Owen Harris"] |
| ["Cumings", " Mrs. John Bradley (Florence Briggs Thayer)"] |
| ["Heikkinen", " Miss. Laina"] |
| ["Futrelle", " Mrs. Jacques Heath (Lily May Peel)"] |
| ["Allen", " Mr. William Henry"] |
| ["Moran", " Mr. James"] |
| ["McCarthy", " Mr. Timothy J"] |
| ["Palsson", " Master. Gosta Leonard"] |
| ["Johnson", " Mrs. Oscar W (Elisabeth Vilhelmina Berg)"] |
| ["Nasser", " Mrs. Nicholas (Adele Achem)"] |
# use a lambda function with 'map_elements' to extract the first element
train["Name"].str.split(",").map_elements(lambda x: x[0], return_dtype=pl.String).head()
shape: (10,)
| Name |
|---|
| str |
| "Braund" |
| "Cumings" |
| "Heikkinen" |
| "Futrelle" |
| "Allen" |
| "Moran" |
| "McCarthy" |
| "Palsson" |
| "Johnson" |
| "Nasser" |
Pivot#
stocks = pl.read_csv("data/stocks.csv")
stocks
shape: (9, 4)
| Date | Close | Volume | Symbol |
|---|---|---|---|
| str | f64 | i64 | str |
| "2016-10-03" | 31.5 | 14070500 | "CSCO" |
| "2016-10-03" | 112.52 | 21701800 | "AAPL" |
| "2016-10-03" | 57.42 | 19189500 | "MSFT" |
| "2016-10-04" | 113.0 | 29736800 | "AAPL" |
| "2016-10-04" | 57.24 | 20085900 | "MSFT" |
| "2016-10-04" | 31.35 | 18460400 | "CSCO" |
| "2016-10-05" | 57.64 | 16726400 | "MSFT" |
| "2016-10-05" | 31.59 | 11808600 | "CSCO" |
| "2016-10-05" | 113.05 | 21453100 | "AAPL" |
stocks = stocks.pivot(values="Close", index="Symbol", on="Date")
stocks
shape: (3, 4)
| Symbol | 2016-10-03 | 2016-10-04 | 2016-10-05 |
|---|---|---|---|
| str | f64 | f64 | f64 |
| "CSCO" | 31.5 | 31.35 | 31.59 |
| "AAPL" | 112.52 | 113.0 | 113.05 |
| "MSFT" | 57.42 | 57.24 | 57.64 |
Clean#
Remove duplicate rows#
user_cols = ["user_id", "age", "gender", "occupation", "zip_code"]
users = pl.read_csv("data/movie.user", separator="|", has_header=False)
users.columns = user_cols
users.head()
shape: (5, 5)
| user_id | age | gender | occupation | zip_code |
|---|---|---|---|---|
| i64 | i64 | str | str | str |
| 1 | 24 | "M" | "technician" | "85711" |
| 2 | 53 | "F" | "other" | "94043" |
| 3 | 23 | "M" | "writer" | "32067" |
| 4 | 24 | "M" | "technician" | "43537" |
| 5 | 33 | "F" | "other" | "15213" |
users.shape
(943, 5)
# count the duplicate items (True becomes 1, False becomes 0)
users["zip_code"].is_duplicated().sum()
250
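# to actually drop duplicates, 'unique' works on whole rows or a subset of
# columns; 'keep' controls which duplicate survives (a sketch)
users.unique(subset=["zip_code"], keep="first").shape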
Detect missing values#
ufo = pl.read_csv("data/ufo.csv")
ufo.tail()
shape: (5, 5)
| City | Colors Reported | Shape Reported | State | Time |
|---|---|---|---|---|
| str | str | str | str | str |
| "Grant Park" | null | "TRIANGLE" | "IL" | "12/31/2000 23:00" |
| "Spirit Lake" | null | "DISK" | "IA" | "12/31/2000 23:00" |
| "Eagle River" | null | null | "WI" | "12/31/2000 23:45" |
| "Eagle River" | "RED" | "LIGHT" | "WI" | "12/31/2000 23:45" |
| "Ybor" | null | "OVAL" | "FL" | "12/31/2000 23:59" |
ufo.select(pl.all().is_null())
shape: (18_241, 5)
| City | Colors Reported | Shape Reported | State | Time |
|---|---|---|---|---|
| bool | bool | bool | bool | bool |
| false | true | false | false | false |
| false | true | false | false | false |
| false | true | false | false | false |
| false | true | false | false | false |
| false | true | false | false | false |
| … | … | … | … | … |
| false | true | false | false | false |
| false | true | false | false | false |
| false | true | true | false | false |
| false | false | false | false | false |
| false | true | false | false | false |
ufo.select(pl.all().is_not_null())
shape: (18_241, 5)
| City | Colors Reported | Shape Reported | State | Time |
|---|---|---|---|---|
| bool | bool | bool | bool | bool |
| true | false | true | true | true |
| true | false | true | true | true |
| true | false | true | true | true |
| true | false | true | true | true |
| true | false | true | true | true |
| … | … | … | … | … |
| true | false | true | true | true |
| true | false | true | true | true |
| true | false | false | true | true |
| true | true | true | true | true |
| true | false | true | true | true |
# count the number of missing values in each Series
ufo.null_count()
shape: (1, 5)
| City | Colors Reported | Shape Reported | State | Time |
|---|---|---|---|---|
| u32 | u32 | u32 | u32 | u32 |
| 25 | 15359 | 2644 | 0 | 0 |
# if 'any' values are missing in a row, then drop that row
ufo.drop_nulls().shape
(2486, 5)
# if 'all' values are missing in a row, then drop that row (none are dropped in this case)
ufo.filter(~pl.all_horizontal(pl.all().is_null())).shape
(18241, 5)
# if 'all' values are missing in a row (considering only 'City' and 'Shape Reported'), then drop that row
ufo.filter(~pl.all_horizontal(pl.col(["City", "Shape Reported"]).is_null())).shape
(18237, 5)
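# the 'any' version of the same subset check: 'drop_nulls' with 'subset' drops
# rows where any of the listed columns is null (a sketch)
ufo.drop_nulls(subset=["City", "Shape Reported"]).shape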
# unlike pandas, 'value_counts' includes missing values (null is counted as its own group)
ufo["Shape Reported"].value_counts()
shape: (28, 2)
| Shape Reported | count |
|---|---|
| str | u32 |
| "TEARDROP" | 119 |
| "DISK" | 2122 |
| "ROUND" | 2 |
| "TRIANGLE" | 1889 |
| "CYLINDER" | 294 |
| … | … |
| "CROSS" | 36 |
| "LIGHT" | 2803 |
| "DIAMOND" | 234 |
| "CRESCENT" | 2 |
| "SPHERE" | 1054 |
Fill missing values#
# fill in missing values with a specified value
ufo.with_columns([
pl.col("Shape Reported").fill_null(value="VARIOUS"),
pl.col("Colors Reported").fill_null(value="UNKNOWN"),
])
shape: (18_241, 5)
| City | Colors Reported | Shape Reported | State | Time |
|---|---|---|---|---|
| str | str | str | str | str |
| "Ithaca" | "UNKNOWN" | "TRIANGLE" | "NY" | "6/1/1930 22:00" |
| "Willingboro" | "UNKNOWN" | "OTHER" | "NJ" | "6/30/1930 20:00" |
| "Holyoke" | "UNKNOWN" | "OVAL" | "CO" | "2/15/1931 14:00" |
| "Abilene" | "UNKNOWN" | "DISK" | "KS" | "6/1/1931 13:00" |
| "New York Worlds Fair" | "UNKNOWN" | "LIGHT" | "NY" | "4/18/1933 19:00" |
| … | … | … | … | … |
| "Grant Park" | "UNKNOWN" | "TRIANGLE" | "IL" | "12/31/2000 23:00" |
| "Spirit Lake" | "UNKNOWN" | "DISK" | "IA" | "12/31/2000 23:00" |
| "Eagle River" | "UNKNOWN" | "VARIOUS" | "WI" | "12/31/2000 23:45" |
| "Eagle River" | "RED" | "LIGHT" | "WI" | "12/31/2000 23:45" |
| "Ybor" | "UNKNOWN" | "OVAL" | "FL" | "12/31/2000 23:59" |
# the filled DataFrame was not assigned back to 'ufo', so the counts are unchanged
ufo["Shape Reported"].value_counts().head()
shape: (5, 2)
| Shape Reported | count |
|---|---|
| str | u32 |
| "DIAMOND" | 234 |
| "TRIANGLE" | 1889 |
| "TEARDROP" | 119 |
| "LIGHT" | 2803 |
| "DELTA" | 7 |
# fill missing values using the "forward" strategy ('ufo' itself is unchanged since the result is not assigned)
ufo.fill_null(strategy="forward").tail()
shape: (5, 5)
| City | Colors Reported | Shape Reported | State | Time |
|---|---|---|---|---|
| str | str | str | str | str |
| "Grant Park" | "RED" | "TRIANGLE" | "IL" | "12/31/2000 23:00" |
| "Spirit Lake" | "RED" | "DISK" | "IA" | "12/31/2000 23:00" |
| "Eagle River" | "RED" | "DISK" | "WI" | "12/31/2000 23:45" |
| "Eagle River" | "RED" | "LIGHT" | "WI" | "12/31/2000 23:45" |
| "Ybor" | "RED" | "OVAL" | "FL" | "12/31/2000 23:59" |
# compare with "forward fill" strategy (doesn't affect the DataFrame since inplace=False)
ufo.fill_null(strategy="backward").tail()
shape: (5, 5)
| City | Colors Reported | Shape Reported | State | Time |
|---|---|---|---|---|
| str | str | str | str | str |
| "Grant Park" | "RED" | "TRIANGLE" | "IL" | "12/31/2000 23:00" |
| "Spirit Lake" | "RED" | "DISK" | "IA" | "12/31/2000 23:00" |
| "Eagle River" | "RED" | "LIGHT" | "WI" | "12/31/2000 23:45" |
| "Eagle River" | "RED" | "LIGHT" | "WI" | "12/31/2000 23:45" |
| "Ybor" | null | "OVAL" | "FL" | "12/31/2000 23:59" |
Outliers (clip)#
drinks["wine_servings"].clip(10, 300)
shape: (193,)
| wine_servings |
|---|
| i64 |
| 10 |
| 54 |
| 14 |
| 300 |
| 45 |
| … |
| 10 |
| 10 |
| 10 |
| 10 |
| 10 |
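# either bound can be omitted to clip one side only (a sketch)
drinks["wine_servings"].clip(upper_bound=300)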
String & Time#
drinks = pl.read_csv("data/drinks.csv")
drinks.head()
shape: (5, 6)
| country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent |
|---|---|---|---|---|---|
| str | i64 | i64 | i64 | f64 | str |
| "Afghanistan" | 0 | 0 | 0 | 0.0 | "Asia" |
| "Albania" | 89 | 132 | 54 | 4.9 | "Europe" |
| "Algeria" | 25 | 0 | 14 | 0.7 | "Africa" |
| "Andorra" | 245 | 138 | 312 | 12.4 | "Europe" |
| "Angola" | 217 | 57 | 45 | 5.9 | "Africa" |
Use string methods#
orders = pl.read_csv("data/chipotle.tsv", separator="\t")
orders.head()
shape: (5, 5)
| order_id | quantity | item_name | choice_description | item_price |
|---|---|---|---|---|
| i64 | i64 | str | str | str |
| 1 | 1 | "Chips and Fresh Tomato Salsa" | "NULL" | "$2.39" |
| 1 | 1 | "Izze" | "[Clementine]" | "$3.39" |
| 1 | 1 | "Nantucket Nectar" | "[Apple]" | "$3.39" |
| 1 | 1 | "Chips and Tomatillo-Green Chil… | "NULL" | "$2.39" |
| 2 | 2 | "Chicken Bowl" | "[Tomatillo-Red Chili Salsa (Ho… | "$16.98" |
# string methods for polars Series are accessed via the 'str' namespace
orders["item_name"].str.to_uppercase().head()
shape: (10,)
| item_name |
|---|
| str |
| "CHIPS AND FRESH TOMATO SALSA" |
| "IZZE" |
| "NANTUCKET NECTAR" |
| "CHIPS AND TOMATILLO-GREEN CHIL… |
| "CHICKEN BOWL" |
| "CHICKEN BOWL" |
| "SIDE OF CHIPS" |
| "STEAK BURRITO" |
| "STEAK SOFT TACOS" |
| "STEAK BURRITO" |
# use a boolean expression to filter the DataFrame
orders.filter(pl.col("item_name").str.contains("Chicken")).head()
shape: (5, 5)
| order_id | quantity | item_name | choice_description | item_price |
|---|---|---|---|---|
| i64 | i64 | str | str | str |
| 2 | 2 | "Chicken Bowl" | "[Tomatillo-Red Chili Salsa (Ho… | "$16.98" |
| 3 | 1 | "Chicken Bowl" | "[Fresh Tomato Salsa (Mild), [R… | "$10.98" |
| 6 | 1 | "Chicken Crispy Tacos" | "[Roasted Chili Corn Salsa, [Fa… | "$8.75" |
| 6 | 1 | "Chicken Soft Tacos" | "[Roasted Chili Corn Salsa, [Ri… | "$8.75" |
| 7 | 1 | "Chicken Bowl" | "[Fresh Tomato Salsa, [Fajita V… | "$11.25" |
# many polars string methods support regex; note that 'str.replace' only replaces the first match in each string
orders["choice_description"].str.replace(r"[\[\]]", "").head()
shape: (10,)
| choice_description |
|---|
| str |
| "NULL" |
| "Clementine]" |
| "Apple]" |
| "NULL" |
| "Tomatillo-Red Chili Salsa (Hot… |
| "Fresh Tomato Salsa (Mild), [Ri… |
| "NULL" |
| "Tomatillo Red Chili Salsa, [Fa… |
| "Tomatillo Green Chili Salsa, [… |
| "Fresh Tomato Salsa, [Rice, Bla… |
Dates & Times#
# read a dataset of UFO reports into a DataFrame
ufo = pl.read_csv("data/ufo.csv")
ufo.head()
shape: (5, 5)
| City | Colors Reported | Shape Reported | State | Time |
|---|---|---|---|---|
| str | str | str | str | str |
| "Ithaca" | null | "TRIANGLE" | "NY" | "6/1/1930 22:00" |
| "Willingboro" | null | "OTHER" | "NJ" | "6/30/1930 20:00" |
| "Holyoke" | null | "OVAL" | "CO" | "2/15/1931 14:00" |
| "Abilene" | null | "DISK" | "KS" | "6/1/1931 13:00" |
| "New York Worlds Fair" | null | "LIGHT" | "NY" | "4/18/1933 19:00" |
# 'Time' is currently stored as a string
ufo.dtypes
[String, String, String, String, String]
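# a sketch of parsing 'Time' into a proper Datetime column; the format string
# assumes non-zero-padded month/day values, which the parser accepts
ufo.with_columns(pl.col("Time").str.to_datetime("%m/%d/%Y %H:%M")).dtypes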
Category data#
drinks = pl.read_csv("data/drinks.csv")
drinks.head()
shape: (5, 6)
| country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent |
|---|---|---|---|---|---|
| str | i64 | i64 | i64 | f64 | str |
| "Afghanistan" | 0 | 0 | 0 | 0.0 | "Asia" |
| "Albania" | 89 | 132 | 54 | 4.9 | "Europe" |
| "Algeria" | 25 | 0 | 14 | 0.7 | "Africa" |
| "Andorra" | 245 | 138 | 312 | 12.4 | "Europe" |
| "Angola" | 217 | 57 | 45 | 5.9 | "Africa" |
# use the Categorical data type to store the 'continent' strings as integers
drinks = drinks.with_columns(pl.col("continent").cast(pl.Categorical))
drinks.dtypes
[String, Int64, Int64, Int64, Float64, Categorical]
# strings are now encoded in order of first appearance (0 means 'Asia', 1 means 'Europe', 2 means 'Africa', etc.)
drinks["continent"].to_physical().head()
shape: (10,)
| continent |
|---|
| u32 |
| 0 |
| 1 |
| 2 |
| 1 |
| 2 |
| 3 |
| 4 |
| 1 |
| 5 |
| 1 |
# repeat this process for the 'country' Series
drinks = drinks.with_columns(pl.col("country").cast(pl.Categorical))
# note the 199 categories: the 193 countries plus the 6 continent names already in the string cache
drinks["country"].cat.get_categories()
shape: (199,)
| country |
|---|
| str |
| "Asia" |
| "Europe" |
| "Africa" |
| "North America" |
| "South America" |
| … |
| "Paraguay" |
| "Peru" |
| "Philippines" |
| "Poland" |
| "Portugal" |
Dummy variables#
train = pl.read_csv("data/titanic_train.csv")
train.head()
shape: (5, 12)
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|
| i64 | i64 | i64 | str | str | f64 | i64 | i64 | str | f64 | str | str |
| 1 | 0 | 3 | "Braund, Mr. Owen Harris" | "male" | 22.0 | 1 | 0 | "A/5 21171" | 7.25 | null | "S" |
| 2 | 1 | 1 | "Cumings, Mrs. John Bradley (Fl… | "female" | 38.0 | 1 | 0 | "PC 17599" | 71.2833 | "C85" | "C" |
| 3 | 1 | 3 | "Heikkinen, Miss. Laina" | "female" | 26.0 | 0 | 0 | "STON/O2. 3101282" | 7.925 | null | "S" |
| 4 | 1 | 1 | "Futrelle, Mrs. Jacques Heath (… | "female" | 35.0 | 1 | 0 | "113803" | 53.1 | "C123" | "S" |
| 5 | 0 | 3 | "Allen, Mr. William Henry" | "male" | 35.0 | 0 | 0 | "373450" | 8.05 | null | "S" |
# use 'to_dummies' to create one column for every possible value
train.to_dummies(["Sex"]).head()
shape: (5, 13)
| PassengerId | Survived | Pclass | Name | Sex_female | Sex_male | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| i64 | i64 | i64 | str | u8 | u8 | f64 | i64 | i64 | str | f64 | str | str |
| 1 | 0 | 3 | "Braund, Mr. Owen Harris" | 0 | 1 | 22.0 | 1 | 0 | "A/5 21171" | 7.25 | null | "S" |
| 2 | 1 | 1 | "Cumings, Mrs. John Bradley (Fl… | 1 | 0 | 38.0 | 1 | 0 | "PC 17599" | 71.2833 | "C85" | "C" |
| 3 | 1 | 3 | "Heikkinen, Miss. Laina" | 1 | 0 | 26.0 | 0 | 0 | "STON/O2. 3101282" | 7.925 | null | "S" |
| 4 | 1 | 1 | "Futrelle, Mrs. Jacques Heath (… | 1 | 0 | 35.0 | 1 | 0 | "113803" | 53.1 | "C123" | "S" |
| 5 | 0 | 3 | "Allen, Mr. William Henry" | 0 | 1 | 35.0 | 0 | 0 | "373450" | 8.05 | null | "S" |
# slice off the first column ('PassengerId'); the original column name is used
# as a prefix, and 'separator' controls how it is joined to each value
train.to_dummies(["Sex"], separator=":")[:, 1:].head()
shape: (5, 12)
| Survived | Pclass | Name | Sex:female | Sex:male | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|
| i64 | i64 | str | u8 | u8 | f64 | i64 | i64 | str | f64 | str | str |
| 0 | 3 | "Braund, Mr. Owen Harris" | 0 | 1 | 22.0 | 1 | 0 | "A/5 21171" | 7.25 | null | "S" |
| 1 | 1 | "Cumings, Mrs. John Bradley (Fl… | 1 | 0 | 38.0 | 1 | 0 | "PC 17599" | 71.2833 | "C85" | "C" |
| 1 | 3 | "Heikkinen, Miss. Laina" | 1 | 0 | 26.0 | 0 | 0 | "STON/O2. 3101282" | 7.925 | null | "S" |
| 1 | 1 | "Futrelle, Mrs. Jacques Heath (… | 1 | 0 | 35.0 | 1 | 0 | "113803" | 53.1 | "C123" | "S" |
| 0 | 3 | "Allen, Mr. William Henry" | 0 | 1 | 35.0 | 0 | 0 | "373450" | 8.05 | null | "S" |
# use 'to_dummies' with a feature that has 3 possible values (null gets its own column)
train.to_dummies(["Embarked"], separator=":").head(10)
shape: (10, 15)
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked:C | Embarked:Q | Embarked:S | Embarked:null |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| i64 | i64 | i64 | str | str | f64 | i64 | i64 | str | f64 | str | u8 | u8 | u8 | u8 |
| 1 | 0 | 3 | "Braund, Mr. Owen Harris" | "male" | 22.0 | 1 | 0 | "A/5 21171" | 7.25 | null | 0 | 0 | 1 | 0 |
| 2 | 1 | 1 | "Cumings, Mrs. John Bradley (Fl… | "female" | 38.0 | 1 | 0 | "PC 17599" | 71.2833 | "C85" | 1 | 0 | 0 | 0 |
| 3 | 1 | 3 | "Heikkinen, Miss. Laina" | "female" | 26.0 | 0 | 0 | "STON/O2. 3101282" | 7.925 | null | 0 | 0 | 1 | 0 |
| 4 | 1 | 1 | "Futrelle, Mrs. Jacques Heath (… | "female" | 35.0 | 1 | 0 | "113803" | 53.1 | "C123" | 0 | 0 | 1 | 0 |
| 5 | 0 | 3 | "Allen, Mr. William Henry" | "male" | 35.0 | 0 | 0 | "373450" | 8.05 | null | 0 | 0 | 1 | 0 |
| 6 | 0 | 3 | "Moran, Mr. James" | "male" | null | 0 | 0 | "330877" | 8.4583 | null | 0 | 1 | 0 | 0 |
| 7 | 0 | 1 | "McCarthy, Mr. Timothy J" | "male" | 54.0 | 0 | 0 | "17463" | 51.8625 | "E46" | 0 | 0 | 1 | 0 |
| 8 | 0 | 3 | "Palsson, Master. Gosta Leonard" | "male" | 2.0 | 3 | 1 | "349909" | 21.075 | null | 0 | 0 | 1 | 0 |
| 9 | 1 | 3 | "Johnson, Mrs. Oscar W (Elisabe… | "female" | 27.0 | 0 | 2 | "347742" | 11.1333 | null | 0 | 0 | 1 | 0 |
| 10 | 1 | 2 | "Nasser, Mrs. Nicholas (Adele A… | "female" | 14.0 | 1 | 0 | "237736" | 30.0708 | null | 1 | 0 | 0 | 0 |
# pass multiple columns to 'to_dummies' at once (the original columns are dropped)
train.to_dummies(["Sex", "Embarked"]).head()
shape: (5, 16)
| PassengerId | Survived | Pclass | Name | Sex_female | Sex_male | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked_C | Embarked_Q | Embarked_S | Embarked_null |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| i64 | i64 | i64 | str | u8 | u8 | f64 | i64 | i64 | str | f64 | str | u8 | u8 | u8 | u8 |
| 1 | 0 | 3 | "Braund, Mr. Owen Harris" | 0 | 1 | 22.0 | 1 | 0 | "A/5 21171" | 7.25 | null | 0 | 0 | 1 | 0 |
| 2 | 1 | 1 | "Cumings, Mrs. John Bradley (Fl… | 1 | 0 | 38.0 | 1 | 0 | "PC 17599" | 71.2833 | "C85" | 1 | 0 | 0 | 0 |
| 3 | 1 | 3 | "Heikkinen, Miss. Laina" | 1 | 0 | 26.0 | 0 | 0 | "STON/O2. 3101282" | 7.925 | null | 0 | 0 | 1 | 0 |
| 4 | 1 | 1 | "Futrelle, Mrs. Jacques Heath (… | 1 | 0 | 35.0 | 1 | 0 | "113803" | 53.1 | "C123" | 0 | 0 | 1 | 0 |
| 5 | 0 | 3 | "Allen, Mr. William Henry" | 0 | 1 | 35.0 | 0 | 0 | "373450" | 8.05 | null | 0 | 0 | 1 | 0 |
# use the 'drop_first' parameter to drop one dummy variable from each feature
train.to_dummies(["Sex", "Embarked"], drop_first=True).head()
shape: (5, 14)
| PassengerId | Survived | Pclass | Name | Sex_female | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked_C | Embarked_Q | Embarked_null |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| i64 | i64 | i64 | str | u8 | f64 | i64 | i64 | str | f64 | str | u8 | u8 | u8 |
| 1 | 0 | 3 | "Braund, Mr. Owen Harris" | 0 | 22.0 | 1 | 0 | "A/5 21171" | 7.25 | null | 0 | 0 | 0 |
| 2 | 1 | 1 | "Cumings, Mrs. John Bradley (Fl… | 1 | 38.0 | 1 | 0 | "PC 17599" | 71.2833 | "C85" | 1 | 0 | 0 |
| 3 | 1 | 3 | "Heikkinen, Miss. Laina" | 1 | 26.0 | 0 | 0 | "STON/O2. 3101282" | 7.925 | null | 0 | 0 | 0 |
| 4 | 1 | 1 | "Futrelle, Mrs. Jacques Heath (… | 1 | 35.0 | 1 | 0 | "113803" | 53.1 | "C123" | 0 | 0 | 0 |
| 5 | 0 | 3 | "Allen, Mr. William Henry" | 0 | 35.0 | 0 | 0 | "373450" | 8.05 | null | 0 | 0 | 0 |
Ordinal data#
# create a small DataFrame from a dictionary
small = pl.DataFrame({
"ID": [100, 101, 102, 103],
"quality": ["good", "good", "very good", "excellent"],
})
small.head()
shape: (4, 2)
| ID | quality |
|---|---|
| i64 | str |
| 100 | "good" |
| 101 | "good" |
| 102 | "very good" |
| 103 | "excellent" |
# cast to a lexically ordered Categorical; the result is not assigned back,
# so 'small' still holds plain strings
small.with_columns(pl.col("quality").cast(pl.Categorical("lexical")))
small["quality"]
shape: (4,)
| quality |
|---|
| str |
| "good" |
| "good" |
| "very good" |
| "excellent" |
# sort the DataFrame by the 'quality' Series (lexical order: 'excellent' < 'good' < 'very good')
small.sort(by="quality")
shape: (4, 2)
| ID | quality |
|---|---|
| i64 | str |
| 103 | "excellent" |
| 100 | "good" |
| 101 | "good" |
| 102 | "very good" |
# comparison operators on plain strings also use lexical order
small.filter(pl.col("quality") > "good")
shape: (1, 2)
| ID | quality |
|---|---|
| i64 | str |
| 102 | "very good" |
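# to impose a true logical ordering ('good' < 'very good' < 'excellent'), an
# Enum dtype can be used (a sketch, assuming a polars version with pl.Enum)
quality = pl.Enum(["good", "very good", "excellent"])
small = small.with_columns(pl.col("quality").cast(quality))
small.filter(pl.col("quality") > "good")  # now matches both 'very good' and 'excellent'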
Sample rows#
# use the 'seed' parameter for reproducibility
ufo.sample(n=3, seed=42)
shape: (3, 5)
| City | Colors Reported | Shape Reported | State | Time |
|---|---|---|---|---|
| str | str | str | str | str |
| "Ellis" | null | "OVAL" | "KS" | "10/15/1999 20:30" |
| "Cleveland" | null | "CIRCLE" | "OH" | "5/1/1988 16:00" |
| "Sidney" | null | "OTHER" | "TX" | "11/15/2000 18:00" |
# sample 75% of the DataFrame's rows without replacement
train = ufo.sample(fraction=0.75, seed=99)
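# to also get the held-out 25%, a row index can be added before sampling and the
# complement taken with an anti-join (a sketch; 'row_nr' and 'test' are hypothetical names)
ufo_idx = ufo.with_row_index("row_nr")
train = ufo_idx.sample(fraction=0.75, seed=99)
test = ufo_idx.join(train, on="row_nr", how="anti")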