读写和过滤#
import polars as pl
Read & Info#
Read tabular data#
user_cols = ["user_id", "age", "gender", "occupation", "zip_code"]
users = pl.read_csv("data/movie.user", separator="|", has_header=False)
users.columns = user_cols
# examine the first 5 rows
users.head()
shape: (5, 5)
user_id | age | gender | occupation | zip_code |
---|---|---|---|---|
i64 | i64 | str | str | str |
1 | 24 | "M" | "technician" | "85711" |
2 | 53 | "F" | "other" | "94043" |
3 | 23 | "M" | "writer" | "32067" |
4 | 24 | "M" | "technician" | "43537" |
5 | 33 | "F" | "other" | "15213" |
# specify which columns to include by name
pl.read_csv("data/ufo.csv")["City", "State"].head()
shape: (5, 2)
City | State |
---|---|
str | str |
"Ithaca" | "NY" |
"Willingboro" | "NJ" |
"Holyoke" | "CO" |
"Abilene" | "KS" |
"New York Worlds Fair" | "NY" |
# specify columns by position
ufo = pl.read_csv("data/ufo.csv")
ufo[:, [0, 4]].head()
shape: (5, 2)
City | Time |
---|---|
str | str |
"Ithaca" | "6/1/1930 22:00" |
"Willingboro" | "6/30/1930 20:00" |
"Holyoke" | "2/15/1931 14:00" |
"Abilene" | "6/1/1931 13:00" |
"New York Worlds Fair" | "4/18/1933 19:00" |
Describe data#
movies = pl.read_csv("data/imdb_1000.csv")
movies.head()
shape: (5, 6)
star_rating | title | content_rating | genre | duration | actors_list |
---|---|---|---|---|---|
f64 | str | str | str | i64 | str |
9.3 | "The Shawshank Redemption" | "R" | "Crime" | 142 | "[u'Tim Robbins', u'Morgan Free… |
9.2 | "The Godfather" | "R" | "Crime" | 175 | "[u'Marlon Brando', u'Al Pacino… |
9.1 | "The Godfather: Part II" | "R" | "Crime" | 200 | "[u'Al Pacino', u'Robert De Nir… |
9.0 | "The Dark Knight" | "PG-13" | "Action" | 152 | "[u'Christian Bale', u'Heath Le… |
8.9 | "Pulp Fiction" | "R" | "Crime" | 154 | "[u'John Travolta', u'Uma Thurm… |
# example attribute: number of rows and columns
movies.shape
(979, 6)
# example attribute: data type of each column
movies.dtypes
[Float64, String, String, String, Int64, String]
# example method: calculate summary statistics
movies.describe()
shape: (9, 7)
statistic | star_rating | title | content_rating | genre | duration | actors_list |
---|---|---|---|---|---|---|
str | f64 | str | str | str | f64 | str |
"count" | 979.0 | "979" | "976" | "979" | 979.0 | "979" |
"null_count" | 0.0 | "0" | "3" | "0" | 0.0 | "0" |
"mean" | 7.889785 | null | null | null | 120.979571 | null |
"std" | 0.336069 | null | null | null | 26.21801 | null |
"min" | 7.4 | "(500) Days of Summer" | "APPROVED" | "Action" | 64.0 | "[u"Brian O'Halloran", u'Jeff A… |
"25%" | 7.6 | null | null | null | 102.0 | null |
"50%" | 7.8 | null | null | null | 117.0 | null |
"75%" | 8.1 | null | null | null | 134.0 | null |
"max" | 9.3 | "[Rec]" | "X" | "Western" | 242.0 | "[u'Zooey Deschanel', u'Joseph … |
movies.select(pl.selectors.numeric()).describe()
shape: (9, 3)
statistic | star_rating | duration |
---|---|---|
str | f64 | f64 |
"count" | 979.0 | 979.0 |
"null_count" | 0.0 | 0.0 |
"mean" | 7.889785 | 120.979571 |
"std" | 0.336069 | 26.21801 |
"min" | 7.4 | 64.0 |
"25%" | 7.6 | 102.0 |
"50%" | 7.8 | 117.0 |
"75%" | 8.1 | 134.0 |
"max" | 9.3 | 242.0 |
Column Manipulation#
Create a new column#
# create a new 'Location' Series (must use bracket notation to define the Series name)
ufo2 = ufo.with_columns(Location=(pl.col("City") + ", " + pl.col("State")))
ufo2.head()
shape: (5, 6)
City | Colors Reported | Shape Reported | State | Time | Location |
---|---|---|---|---|---|
str | str | str | str | str | str |
"Ithaca" | null | "TRIANGLE" | "NY" | "6/1/1930 22:00" | "Ithaca, NY" |
"Willingboro" | null | "OTHER" | "NJ" | "6/30/1930 20:00" | "Willingboro, NJ" |
"Holyoke" | null | "OVAL" | "CO" | "2/15/1931 14:00" | "Holyoke, CO" |
"Abilene" | null | "DISK" | "KS" | "6/1/1931 13:00" | "Abilene, KS" |
"New York Worlds Fair" | null | "LIGHT" | "NY" | "4/18/1933 19:00" | "New York Worlds Fair, NY" |
Rename columns#
ufo = pl.read_csv("data/ufo.csv")
# examine the column names
ufo.columns
['City', 'Colors Reported', 'Shape Reported', 'State', 'Time']
# rename two of the columns by using the 'rename' method
ufo = ufo.rename({
"Colors Reported": "Colors_Reported",
"Shape Reported": "Shape_Reported",
})
ufo.columns
['City', 'Colors_Reported', 'Shape_Reported', 'State', 'Time']
# replace all of the column names by overwriting the 'columns' attribute
ufo_cols = ["city", "colors reported", "shape reported", "state", "time"]
ufo.columns = ufo_cols
ufo.columns
['city', 'colors reported', 'shape reported', 'state', 'time']
# replace the column names during the file reading process by using the 'names' parameter
ufo = pl.read_csv("data/ufo.csv", has_header=False)
ufo.columns = ufo_cols
ufo.columns
['city', 'colors reported', 'shape reported', 'state', 'time']
# replace all spaces with underscores in the column names by using the 'str.replace' method
ufo.columns = pl.Series(ufo.columns).str.replace(" ", "_").to_list()
ufo.columns
['city', 'colors_reported', 'shape_reported', 'state', 'time']
Remove columns#
ufo = pl.read_csv("data/ufo.csv")
ufo.head()
shape: (5, 5)
City | Colors Reported | Shape Reported | State | Time |
---|---|---|---|---|
str | str | str | str | str |
"Ithaca" | null | "TRIANGLE" | "NY" | "6/1/1930 22:00" |
"Willingboro" | null | "OTHER" | "NJ" | "6/30/1930 20:00" |
"Holyoke" | null | "OVAL" | "CO" | "2/15/1931 14:00" |
"Abilene" | null | "DISK" | "KS" | "6/1/1931 13:00" |
"New York Worlds Fair" | null | "LIGHT" | "NY" | "4/18/1933 19:00" |
# remove a single column (axis=1 refers to columns)
ufo = ufo.drop("Colors Reported")
ufo.head()
shape: (5, 4)
City | Shape Reported | State | Time |
---|---|---|---|
str | str | str | str |
"Ithaca" | "TRIANGLE" | "NY" | "6/1/1930 22:00" |
"Willingboro" | "OTHER" | "NJ" | "6/30/1930 20:00" |
"Holyoke" | "OVAL" | "CO" | "2/15/1931 14:00" |
"Abilene" | "DISK" | "KS" | "6/1/1931 13:00" |
"New York Worlds Fair" | "LIGHT" | "NY" | "4/18/1933 19:00" |
# remove multiple columns at once
ufo.drop(["City", "State"]).head()
shape: (5, 2)
Shape Reported | Time |
---|---|
str | str |
"TRIANGLE" | "6/1/1930 22:00" |
"OTHER" | "6/30/1930 20:00" |
"OVAL" | "2/15/1931 14:00" |
"DISK" | "6/1/1931 13:00" |
"LIGHT" | "4/18/1933 19:00" |
Remove rows#
# new way to drop rows: specify index
# ufo.drop(index=[0, 1]).head()
Select#
Iterate#
ufo = pl.read_csv("data/ufo.csv")
ufo.head()
shape: (5, 5)
City | Colors Reported | Shape Reported | State | Time |
---|---|---|---|---|
str | str | str | str | str |
"Ithaca" | null | "TRIANGLE" | "NY" | "6/1/1930 22:00" |
"Willingboro" | null | "OTHER" | "NJ" | "6/30/1930 20:00" |
"Holyoke" | null | "OVAL" | "CO" | "2/15/1931 14:00" |
"Abilene" | null | "DISK" | "KS" | "6/1/1931 13:00" |
"New York Worlds Fair" | null | "LIGHT" | "NY" | "4/18/1933 19:00" |
# various methods are available to iterate through a DataFrame
for row in ufo[:10].iter_rows(named=True):
print(row["City"], row["State"])
Ithaca NY
Willingboro NJ
Holyoke CO
Abilene KS
New York Worlds Fair NY
Valley City ND
Crater Lake CA
Alma MI
Eklutna AK
Hubbard OR
Index#
drinks = pl.read_csv("data/drinks.csv")
drinks.head()
shape: (5, 6)
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent |
---|---|---|---|---|---|
str | i64 | i64 | i64 | f64 | str |
"Afghanistan" | 0 | 0 | 0 | 0.0 | "Asia" |
"Albania" | 89 | 132 | 54 | 4.9 | "Europe" |
"Algeria" | 25 | 0 | 14 | 0.7 | "Africa" |
"Andorra" | 245 | 138 | 312 | 12.4 | "Europe" |
"Angola" | 217 | 57 | 45 | 5.9 | "Africa" |
# every DataFrame has an index (sometimes called the "row labels")
drinks.with_row_index()["index"]
shape: (193,)
index |
---|
u32 |
0 |
1 |
2 |
3 |
4 |
… |
188 |
189 |
190 |
191 |
192 |
Select a column#
# select the 'City' Series using bracket notation
drinks["country"]
shape: (193,)
country |
---|
str |
"Afghanistan" |
"Albania" |
"Algeria" |
"Andorra" |
"Angola" |
… |
"Venezuela" |
"Vietnam" |
"Yemen" |
"Zambia" |
"Zimbabwe" |
Select numeric columns#
drinks.dtypes
[String, Int64, Int64, Int64, Float64, String]
drinks.select(pl.selectors.numeric()).dtypes
[Int64, Int64, Int64, Float64]
Select rows and columns#
# row 0, all columns
drinks[0, :]
shape: (1, 6)
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent |
---|---|---|---|---|---|
str | i64 | i64 | i64 | f64 | str |
"Afghanistan" | 0 | 0 | 0 | 0.0 | "Asia" |
# rows 0 and 1 and 2, all columns
drinks[[0, 1, 2], :]
shape: (3, 6)
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent |
---|---|---|---|---|---|
str | i64 | i64 | i64 | f64 | str |
"Afghanistan" | 0 | 0 | 0 | 0.0 | "Asia" |
"Albania" | 89 | 132 | 54 | 4.9 | "Europe" |
"Algeria" | 25 | 0 | 14 | 0.7 | "Africa" |
# rows 0 through 2 (inclusive), all columns
drinks[0:2, :]
shape: (2, 6)
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent |
---|---|---|---|---|---|
str | i64 | i64 | i64 | f64 | str |
"Afghanistan" | 0 | 0 | 0 | 0.0 | "Asia" |
"Albania" | 89 | 132 | 54 | 4.9 | "Europe" |
# rows 0 through 2 (inclusive), column 'City'
drinks[0:2, "country"]
shape: (2,)
country |
---|
str |
"Afghanistan" |
"Albania" |
# rows 0 through 2 (inclusive), columns 'City' and 'State'
drinks[0:2, ["country", "continent"]]
shape: (2, 2)
country | continent |
---|---|
str | str |
"Afghanistan" | "Asia" |
"Albania" | "Europe" |
# rows 0 through 2 (inclusive), columns 'City' through 'State' (inclusive)
drinks[0:2, "country":"continent"]
shape: (2, 6)
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent |
---|---|---|---|---|---|
str | i64 | i64 | i64 | f64 | str |
"Afghanistan" | 0 | 0 | 0 | 0.0 | "Asia" |
"Albania" | 89 | 132 | 54 | 4.9 | "Europe" |
# rows in positions 0 and 1, columns in positions 0 and 3
drinks[[0, 1], [0, 3]]
shape: (2, 2)
country | wine_servings |
---|---|
str | i64 |
"Afghanistan" | 0 |
"Albania" | 54 |
# rows in positions 0 through 2 (exclusive), columns in positions 0 through 4 (exclusive)
drinks[0:2, 0:4]
shape: (2, 4)
country | beer_servings | spirit_servings | wine_servings |
---|---|---|---|
str | i64 | i64 | i64 |
"Afghanistan" | 0 | 0 | 0 |
"Albania" | 89 | 132 | 54 |
# rows in positions 0 through 2 (exclusive), all columns
drinks[0:2, :]
shape: (2, 6)
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent |
---|---|---|---|---|---|
str | i64 | i64 | i64 | f64 | str |
"Afghanistan" | 0 | 0 | 0 | 0.0 | "Asia" |
"Albania" | 89 | 132 | 54 | 4.9 | "Europe" |
Filter#
Column filter#
drinks.select(["beer_servings", "continent"])
shape: (193, 2)
beer_servings | continent |
---|---|
i64 | str |
0 | "Asia" |
89 | "Europe" |
25 | "Africa" |
245 | "Europe" |
217 | "Africa" |
… | … |
333 | "South America" |
111 | "Asia" |
6 | "Asia" |
32 | "Africa" |
64 | "Africa" |
drinks.select(pl.selectors.contains("servings"))
shape: (193, 3)
beer_servings | spirit_servings | wine_servings |
---|---|---|
i64 | i64 | i64 |
0 | 0 | 0 |
89 | 132 | 54 |
25 | 0 | 14 |
245 | 138 | 312 |
217 | 57 | 45 |
… | … | … |
333 | 100 | 3 |
111 | 2 | 1 |
6 | 0 | 0 |
32 | 19 | 4 |
64 | 18 | 4 |
Index filter#
drinks.with_row_index().filter(pl.col("index") % 2 == 0)
shape: (97, 7)
index | country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent |
---|---|---|---|---|---|---|
u32 | str | i64 | i64 | i64 | f64 | str |
0 | "Afghanistan" | 0 | 0 | 0 | 0.0 | "Asia" |
2 | "Algeria" | 25 | 0 | 14 | 0.7 | "Africa" |
4 | "Angola" | 217 | 57 | 45 | 5.9 | "Africa" |
6 | "Argentina" | 193 | 25 | 221 | 8.3 | "South America" |
8 | "Australia" | 261 | 72 | 212 | 10.4 | "Oceania" |
… | … | … | … | … | … | … |
184 | "USA" | 249 | 158 | 84 | 8.7 | "North America" |
186 | "Uzbekistan" | 25 | 101 | 8 | 2.4 | "Asia" |
188 | "Venezuela" | 333 | 100 | 3 | 7.7 | "South America" |
190 | "Yemen" | 6 | 0 | 0 | 0.1 | "Asia" |
192 | "Zimbabwe" | 64 | 18 | 4 | 4.7 | "Africa" |
Value filter#
drinks.filter(pl.col("continent") == "South America")
shape: (12, 6)
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent |
---|---|---|---|---|---|
str | i64 | i64 | i64 | f64 | str |
"Argentina" | 193 | 25 | 221 | 8.3 | "South America" |
"Bolivia" | 167 | 41 | 8 | 3.8 | "South America" |
"Brazil" | 245 | 145 | 16 | 7.2 | "South America" |
"Chile" | 130 | 124 | 172 | 7.6 | "South America" |
"Colombia" | 159 | 76 | 3 | 4.2 | "South America" |
… | … | … | … | … | … |
"Paraguay" | 213 | 117 | 74 | 7.3 | "South America" |
"Peru" | 163 | 160 | 21 | 6.1 | "South America" |
"Suriname" | 128 | 178 | 7 | 5.6 | "South America" |
"Uruguay" | 115 | 35 | 220 | 6.6 | "South America" |
"Venezuela" | 333 | 100 | 3 | 7.7 | "South America" |
movies = pl.read_csv("data/imdb_1000.csv")
movies.head()
shape: (5, 6)
star_rating | title | content_rating | genre | duration | actors_list |
---|---|---|---|---|---|
f64 | str | str | str | i64 | str |
9.3 | "The Shawshank Redemption" | "R" | "Crime" | 142 | "[u'Tim Robbins', u'Morgan Free… |
9.2 | "The Godfather" | "R" | "Crime" | 175 | "[u'Marlon Brando', u'Al Pacino… |
9.1 | "The Godfather: Part II" | "R" | "Crime" | 200 | "[u'Al Pacino', u'Robert De Nir… |
9.0 | "The Dark Knight" | "PG-13" | "Action" | 152 | "[u'Christian Bale', u'Heath Le… |
8.9 | "Pulp Fiction" | "R" | "Crime" | 154 | "[u'John Travolta', u'Uma Thurm… |
# use the '&' operator to specify that both conditions are required
movies.filter((pl.col("duration") >= 200) & (pl.col("genre") == "Drama"))
shape: (3, 6)
star_rating | title | content_rating | genre | duration | actors_list |
---|---|---|---|---|---|
f64 | str | str | str | i64 | str |
8.7 | "Seven Samurai" | "UNRATED" | "Drama" | 207 | "[u'Toshir\xf4 Mifune', u'Takas… |
8.2 | "Gone with the Wind" | "G" | "Drama" | 238 | "[u'Clark Gable', u'Vivien Leig… |
7.8 | "Hamlet" | "PG-13" | "Drama" | 242 | "[u'Kenneth Branagh', u'Julie C… |
# using the '|' operator would have shown movies that are either long or dramas (or both)
movies.filter((pl.col("duration") >= 200) | (pl.col("genre") == "Drama"))
shape: (287, 6)
star_rating | title | content_rating | genre | duration | actors_list |
---|---|---|---|---|---|
f64 | str | str | str | i64 | str |
9.1 | "The Godfather: Part II" | "R" | "Crime" | 200 | "[u'Al Pacino', u'Robert De Nir… |
8.9 | "12 Angry Men" | "NOT RATED" | "Drama" | 96 | "[u'Henry Fonda', u'Lee J. Cobb… |
8.9 | "The Lord of the Rings: The Ret… | "PG-13" | "Adventure" | 201 | "[u'Elijah Wood', u'Viggo Morte… |
8.9 | "Fight Club" | "R" | "Drama" | 139 | "[u'Brad Pitt', u'Edward Norton… |
8.8 | "Forrest Gump" | "PG-13" | "Drama" | 142 | "[u'Tom Hanks', u'Robin Wright'… |
… | … | … | … | … | … |
7.4 | "My Sister's Keeper" | "PG-13" | "Drama" | 109 | "[u'Cameron Diaz', u'Abigail Br… |
7.4 | "The English Patient" | "R" | "Drama" | 162 | "[u'Ralph Fiennes', u'Juliette … |
7.4 | "Wonder Boys" | "R" | "Drama" | 107 | "[u'Michael Douglas', u'Tobey M… |
7.4 | "Blue Valentine" | "NC-17" | "Drama" | 112 | "[u'Ryan Gosling', u'Michelle W… |
7.4 | "The Cider House Rules" | "PG-13" | "Drama" | 126 | "[u'Tobey Maguire', u'Charlize … |
Range filter#
movies.filter(pl.col("genre").is_in(["Crime", "Drama", "Action"])).head()
shape: (5, 6)
star_rating | title | content_rating | genre | duration | actors_list |
---|---|---|---|---|---|
f64 | str | str | str | i64 | str |
9.3 | "The Shawshank Redemption" | "R" | "Crime" | 142 | "[u'Tim Robbins', u'Morgan Free… |
9.2 | "The Godfather" | "R" | "Crime" | 175 | "[u'Marlon Brando', u'Al Pacino… |
9.1 | "The Godfather: Part II" | "R" | "Crime" | 200 | "[u'Al Pacino', u'Robert De Nir… |
9.0 | "The Dark Knight" | "PG-13" | "Action" | 152 | "[u'Christian Bale', u'Heath Le… |
8.9 | "Pulp Fiction" | "R" | "Crime" | 154 | "[u'John Travolta', u'Uma Thurm… |
movies.filter(pl.col("duration").is_between(100, 200, closed="none")).head()
shape: (5, 6)
star_rating | title | content_rating | genre | duration | actors_list |
---|---|---|---|---|---|
f64 | str | str | str | i64 | str |
9.3 | "The Shawshank Redemption" | "R" | "Crime" | 142 | "[u'Tim Robbins', u'Morgan Free… |
9.2 | "The Godfather" | "R" | "Crime" | 175 | "[u'Marlon Brando', u'Al Pacino… |
9.0 | "The Dark Knight" | "PG-13" | "Action" | 152 | "[u'Christian Bale', u'Heath Le… |
8.9 | "Pulp Fiction" | "R" | "Crime" | 154 | "[u'John Travolta', u'Uma Thurm… |
8.9 | "The Good, the Bad and the Ugly" | "NOT RATED" | "Western" | 161 | "[u'Clint Eastwood', u'Eli Wall… |
Reshape#
drinks = pl.read_csv("data/drinks.csv")
drinks.head()
shape: (5, 6)
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent |
---|---|---|---|---|---|
str | i64 | i64 | i64 | f64 | str |
"Afghanistan" | 0 | 0 | 0 | 0.0 | "Asia" |
"Albania" | 89 | 132 | 54 | 4.9 | "Europe" |
"Algeria" | 25 | 0 | 14 | 0.7 | "Africa" |
"Andorra" | 245 | 138 | 312 | 12.4 | "Europe" |
"Angola" | 217 | 57 | 45 | 5.9 | "Africa" |
# apply the 'max' function along axis 0 to calculate the maximum value in each column
drinks[:, "beer_servings":"wine_servings"].max()
shape: (1, 3)
beer_servings | spirit_servings | wine_servings |
---|---|---|
i64 | i64 | i64 |
376 | 438 | 370 |
# convert every DataFrame element into a float
drinks[:, "beer_servings":"wine_servings"].cast(float).head()
shape: (5, 3)
beer_servings | spirit_servings | wine_servings |
---|---|---|
f64 | f64 | f64 |
0.0 | 0.0 | 0.0 |
89.0 | 132.0 | 54.0 |
25.0 | 0.0 | 14.0 |
245.0 | 138.0 | 312.0 |
217.0 | 57.0 | 45.0 |
Concat#
df1 = pl.DataFrame({
"name": ["A", "B", "C", "D"],
"math": [60, 89, 82, 70],
"physics": [66, 95, 83, 66],
"chemistry": [61, 91, 77, 70],
})
df2 = pl.DataFrame({
"name": ["E", "F", "G", "H"],
"math": [66, 95, 83, 66],
"physics": [60, 89, 82, 70],
"chemistry": [90, 81, 78, 90],
})
df12 = pl.concat([df1, df2])
df12
shape: (8, 4)
name | math | physics | chemistry |
---|---|---|---|
str | i64 | i64 | i64 |
"A" | 60 | 66 | 61 |
"B" | 89 | 95 | 91 |
"C" | 82 | 83 | 77 |
"D" | 70 | 66 | 70 |
"E" | 66 | 60 | 90 |
"F" | 95 | 89 | 81 |
"G" | 83 | 82 | 78 |
"H" | 66 | 70 | 90 |
df12.select(sorted(df12.columns))
shape: (8, 4)
chemistry | math | name | physics |
---|---|---|---|
i64 | i64 | str | i64 |
61 | 60 | "A" | 66 |
91 | 89 | "B" | 95 |
77 | 82 | "C" | 83 |
70 | 70 | "D" | 66 |
90 | 66 | "E" | 60 |
81 | 95 | "F" | 89 |
78 | 83 | "G" | 82 |
90 | 66 | "H" | 70 |
Join#
movie_cols = ["movie_id", "title", "date"]
movies = pl.read_csv("data/u.item", separator="|", has_header=False)[:, :3]
movies.columns = movie_cols
movies.head()
shape: (5, 3)
movie_id | title | date |
---|---|---|
i64 | str | str |
1 | "Toy Story (1995)" | "01-Jan-1995" |
2 | "GoldenEye (1995)" | "01-Jan-1995" |
3 | "Four Rooms (1995)" | "01-Jan-1995" |
4 | "Get Shorty (1995)" | "01-Jan-1995" |
5 | "Copycat (1995)" | "01-Jan-1995" |
movies.shape
(1682, 3)
rating_cols = ["user_id", "movie_id", "rating", "timestamp"]
ratings = pl.read_csv("data/u.data", separator="\t", has_header=False)
ratings.columns = rating_cols
ratings.head()
shape: (5, 4)
user_id | movie_id | rating | timestamp |
---|---|---|---|
i64 | i64 | i64 | i64 |
196 | 242 | 3 | 881250949 |
186 | 302 | 3 | 891717742 |
22 | 377 | 1 | 878887116 |
244 | 51 | 2 | 880606923 |
166 | 346 | 1 | 886397596 |
ratings.shape
(100000, 4)
ratings.filter(pl.col("movie_id") == 1).head()
shape: (5, 4)
user_id | movie_id | rating | timestamp |
---|---|---|---|
i64 | i64 | i64 | i64 |
308 | 1 | 4 | 887736532 |
287 | 1 | 5 | 875334088 |
148 | 1 | 4 | 877019411 |
280 | 1 | 4 | 891700426 |
66 | 1 | 3 | 883601324 |
# Returns rows that have matching values in both tables
movies.join(ratings, on="movie_id").head()
shape: (5, 6)
movie_id | title | date | user_id | rating | timestamp |
---|---|---|---|---|---|
i64 | str | str | i64 | i64 | i64 |
242 | "Kolya (1996)" | "24-Jan-1997" | 196 | 3 | 881250949 |
302 | "L.A. Confidential (1997)" | "01-Jan-1997" | 186 | 3 | 891717742 |
377 | "Heavyweights (1994)" | "01-Jan-1994" | 22 | 1 | 878887116 |
51 | "Legends of the Fall (1994)" | "01-Jan-1994" | 244 | 2 | 880606923 |
346 | "Jackie Brown (1997)" | "01-Jan-1997" | 166 | 1 | 886397596 |
# Returns all rows from the left table, and the matched rows from the right table
movies.join(ratings, on="movie_id", how="left").head()
shape: (5, 6)
movie_id | title | date | user_id | rating | timestamp |
---|---|---|---|---|---|
i64 | str | str | i64 | i64 | i64 |
1 | "Toy Story (1995)" | "01-Jan-1995" | 308 | 4 | 887736532 |
1 | "Toy Story (1995)" | "01-Jan-1995" | 287 | 5 | 875334088 |
1 | "Toy Story (1995)" | "01-Jan-1995" | 148 | 4 | 877019411 |
1 | "Toy Story (1995)" | "01-Jan-1995" | 280 | 4 | 891700426 |
1 | "Toy Story (1995)" | "01-Jan-1995" | 66 | 3 | 883601324 |
# Returns all rows from the right table, and the matched rows from the left table
movies.join(ratings, on="movie_id", how="right").head()
shape: (5, 6)
title | date | user_id | movie_id | rating | timestamp |
---|---|---|---|---|---|
str | str | i64 | i64 | i64 | i64 |
"Kolya (1996)" | "24-Jan-1997" | 196 | 242 | 3 | 881250949 |
"L.A. Confidential (1997)" | "01-Jan-1997" | 186 | 302 | 3 | 891717742 |
"Heavyweights (1994)" | "01-Jan-1994" | 22 | 377 | 1 | 878887116 |
"Legends of the Fall (1994)" | "01-Jan-1994" | 244 | 51 | 2 | 880606923 |
"Jackie Brown (1997)" | "01-Jan-1997" | 166 | 346 | 1 | 886397596 |
# Returns all rows when there is a match in either left or right table
movies.join(ratings, on="movie_id", how="full").head()
shape: (5, 7)
movie_id | title | date | user_id | movie_id_right | rating | timestamp |
---|---|---|---|---|---|---|
i64 | str | str | i64 | i64 | i64 | i64 |
242 | "Kolya (1996)" | "24-Jan-1997" | 196 | 242 | 3 | 881250949 |
302 | "L.A. Confidential (1997)" | "01-Jan-1997" | 186 | 302 | 3 | 891717742 |
377 | "Heavyweights (1994)" | "01-Jan-1994" | 22 | 377 | 1 | 878887116 |
51 | "Legends of the Fall (1994)" | "01-Jan-1994" | 244 | 51 | 2 | 880606923 |
346 | "Jackie Brown (1997)" | "01-Jan-1997" | 166 | 346 | 1 | 886397596 |
# equals to inner join
movies.join(ratings, left_on="movie_id", right_on="movie_id").head()
shape: (5, 6)
movie_id | title | date | user_id | rating | timestamp |
---|---|---|---|---|---|
i64 | str | str | i64 | i64 | i64 |
242 | "Kolya (1996)" | "24-Jan-1997" | 196 | 3 | 881250949 |
302 | "L.A. Confidential (1997)" | "01-Jan-1997" | 186 | 3 | 891717742 |
377 | "Heavyweights (1994)" | "01-Jan-1994" | 22 | 1 | 878887116 |
51 | "Legends of the Fall (1994)" | "01-Jan-1994" | 244 | 2 | 880606923 |
346 | "Jackie Brown (1997)" | "01-Jan-1997" | 166 | 1 | 886397596 |
MultiIndexes#
# close = pl.read_csv(
# "data/stocks.csv", usecols=[0, 1, 3], index_col=["Symbol", "Date"]
# ).sort_index()
# close
# volume = pl.read_csv(
# "data/stocks.csv", usecols=[0, 2, 3], index_col=["Symbol", "Date"]
# ).sort_index()
# volume
Explode#
orders = pl.read_csv("data/chipotle.tsv", separator="\t")
orders.head()
shape: (5, 5)
order_id | quantity | item_name | choice_description | item_price |
---|---|---|---|---|
i64 | i64 | str | str | str |
1 | 1 | "Chips and Fresh Tomato Salsa" | "NULL" | "$2.39" |
1 | 1 | "Izze" | "[Clementine]" | "$3.39" |
1 | 1 | "Nantucket Nectar" | "[Apple]" | "$3.39" |
1 | 1 | "Chips and Tomatillo-Green Chil… | "NULL" | "$2.39" |
2 | 2 | "Chicken Bowl" | "[Tomatillo-Red Chili Salsa (Ho… | "$16.98" |
orders2 = orders.with_columns(pl.col("item_name").str.split(" "))
orders2.head()
shape: (5, 5)
order_id | quantity | item_name | choice_description | item_price |
---|---|---|---|---|
i64 | i64 | list[str] | str | str |
1 | 1 | ["Chips", "and", … "Salsa"] | "NULL" | "$2.39" |
1 | 1 | ["Izze"] | "[Clementine]" | "$3.39" |
1 | 1 | ["Nantucket", "Nectar"] | "[Apple]" | "$3.39" |
1 | 1 | ["Chips", "and", … "Salsa"] | "NULL" | "$2.39" |
2 | 2 | ["Chicken", "Bowl"] | "[Tomatillo-Red Chili Salsa (Ho… | "$16.98" |
orders2.explode("item_name")
shape: (11_501, 5)
order_id | quantity | item_name | choice_description | item_price |
---|---|---|---|---|
i64 | i64 | str | str | str |
1 | 1 | "Chips" | "NULL" | "$2.39" |
1 | 1 | "and" | "NULL" | "$2.39" |
1 | 1 | "Fresh" | "NULL" | "$2.39" |
1 | 1 | "Tomato" | "NULL" | "$2.39" |
1 | 1 | "Salsa" | "NULL" | "$2.39" |
… | … | … | … | … |
1834 | 1 | "Salad" | "[Fresh Tomato Salsa, [Fajita V… | "$8.75" |
1834 | 1 | "Bowl" | "[Fresh Tomato Salsa, [Fajita V… | "$8.75" |
1834 | 1 | "Chicken" | "[Fresh Tomato Salsa, [Fajita V… | "$8.75" |
1834 | 1 | "Salad" | "[Fresh Tomato Salsa, [Fajita V… | "$8.75" |
1834 | 1 | "Bowl" | "[Fresh Tomato Salsa, [Fajita V… | "$8.75" |