读写和过滤

读写和过滤#

import polars as pl

Read & Info#

Read tabular data#

user_cols = ["user_id", "age", "gender", "occupation", "zip_code"]
users = pl.read_csv("data/movie.user", separator="|", has_header=False)
users.columns = user_cols

# examine the first 5 rows
users.head()

shape: (5, 5)

user_id	age	gender	occupation	zip_code
i64	i64	str	str	str
1	24	"M"	"technician"	"85711"
2	53	"F"	"other"	"94043"
3	23	"M"	"writer"	"32067"
4	24	"M"	"technician"	"43537"
5	33	"F"	"other"	"15213"

# specify which columns to include by name
pl.read_csv("data/ufo.csv")["City", "State"].head()

shape: (5, 2)

City	State
str	str
"Ithaca"	"NY"
"Willingboro"	"NJ"
"Holyoke"	"CO"
"Abilene"	"KS"
"New York Worlds Fair"	"NY"

# specify columns by position
ufo = pl.read_csv("data/ufo.csv")
ufo[:, [0, 4]].head()

shape: (5, 2)

City	Time
str	str
"Ithaca"	"6/1/1930 22:00"
"Willingboro"	"6/30/1930 20:00"
"Holyoke"	"2/15/1931 14:00"
"Abilene"	"6/1/1931 13:00"
"New York Worlds Fair"	"4/18/1933 19:00"

Describe data#

movies = pl.read_csv("data/imdb_1000.csv")
movies.head()

shape: (5, 6)

star_rating	title	content_rating	genre	duration	actors_list
f64	str	str	str	i64	str
9.3	"The Shawshank Redemption"	"R"	"Crime"	142	"[u'Tim Robbins', u'Morgan Free…
9.2	"The Godfather"	"R"	"Crime"	175	"[u'Marlon Brando', u'Al Pacino…
9.1	"The Godfather: Part II"	"R"	"Crime"	200	"[u'Al Pacino', u'Robert De Nir…
9.0	"The Dark Knight"	"PG-13"	"Action"	152	"[u'Christian Bale', u'Heath Le…
8.9	"Pulp Fiction"	"R"	"Crime"	154	"[u'John Travolta', u'Uma Thurm…

# example attribute: number of rows and columns
movies.shape

(979, 6)

# example attribute: data type of each column
movies.dtypes

[Float64, String, String, String, Int64, String]

# example method: calculate summary statistics
movies.describe()

shape: (9, 7)

statistic	star_rating	title	content_rating	genre	duration	actors_list
str	f64	str	str	str	f64	str
"count"	979.0	"979"	"976"	"979"	979.0	"979"
"null_count"	0.0	"0"	"3"	"0"	0.0	"0"
"mean"	7.889785	null	null	null	120.979571	null
"std"	0.336069	null	null	null	26.21801	null
"min"	7.4	"(500) Days of Summer"	"APPROVED"	"Action"	64.0	"[u"Brian O'Halloran", u'Jeff A…
"25%"	7.6	null	null	null	102.0	null
"50%"	7.8	null	null	null	117.0	null
"75%"	8.1	null	null	null	134.0	null
"max"	9.3	"[Rec]"	"X"	"Western"	242.0	"[u'Zooey Deschanel', u'Joseph …

movies.select(pl.selectors.numeric()).describe()

shape: (9, 3)

statistic	star_rating	duration
str	f64	f64
"count"	979.0	979.0
"null_count"	0.0	0.0
"mean"	7.889785	120.979571
"std"	0.336069	26.21801
"min"	7.4	64.0
"25%"	7.6	102.0
"50%"	7.8	117.0
"75%"	8.1	134.0
"max"	9.3	242.0

Column Manipulation#

Create a new column#

# create a new 'Location' Series (must use bracket notation to define the Series name)
ufo2 = ufo.with_columns(Location=(pl.col("City") + ", " + pl.col("State")))
ufo2.head()

shape: (5, 6)

City	Colors Reported	Shape Reported	State	Time	Location
str	str	str	str	str	str
"Ithaca"	null	"TRIANGLE"	"NY"	"6/1/1930 22:00"	"Ithaca, NY"
"Willingboro"	null	"OTHER"	"NJ"	"6/30/1930 20:00"	"Willingboro, NJ"
"Holyoke"	null	"OVAL"	"CO"	"2/15/1931 14:00"	"Holyoke, CO"
"Abilene"	null	"DISK"	"KS"	"6/1/1931 13:00"	"Abilene, KS"
"New York Worlds Fair"	null	"LIGHT"	"NY"	"4/18/1933 19:00"	"New York Worlds Fair, NY"

Rename columns#

ufo = pl.read_csv("data/ufo.csv")

# examine the column names
ufo.columns

['City', 'Colors Reported', 'Shape Reported', 'State', 'Time']

# rename two of the columns by using the 'rename' method
ufo = ufo.rename({
    "Colors Reported": "Colors_Reported",
    "Shape Reported": "Shape_Reported",
})
ufo.columns

['City', 'Colors_Reported', 'Shape_Reported', 'State', 'Time']

# replace all of the column names by overwriting the 'columns' attribute
ufo_cols = ["city", "colors reported", "shape reported", "state", "time"]
ufo.columns = ufo_cols
ufo.columns

['city', 'colors reported', 'shape reported', 'state', 'time']

# replace the column names during the file reading process by using the 'names' parameter
ufo = pl.read_csv("data/ufo.csv", has_header=False)
ufo.columns = ufo_cols
ufo.columns

['city', 'colors reported', 'shape reported', 'state', 'time']

# replace all spaces with underscores in the column names by using the 'str.replace' method
ufo.columns = pl.Series(ufo.columns).str.replace(" ", "_").to_list()
ufo.columns

['city', 'colors_reported', 'shape_reported', 'state', 'time']

Remove columns#

ufo = pl.read_csv("data/ufo.csv")
ufo.head()

shape: (5, 5)

City	Colors Reported	Shape Reported	State	Time
str	str	str	str	str
"Ithaca"	null	"TRIANGLE"	"NY"	"6/1/1930 22:00"
"Willingboro"	null	"OTHER"	"NJ"	"6/30/1930 20:00"
"Holyoke"	null	"OVAL"	"CO"	"2/15/1931 14:00"
"Abilene"	null	"DISK"	"KS"	"6/1/1931 13:00"
"New York Worlds Fair"	null	"LIGHT"	"NY"	"4/18/1933 19:00"

# remove a single column (axis=1 refers to columns)
ufo = ufo.drop("Colors Reported")
ufo.head()

shape: (5, 4)

City	Shape Reported	State	Time
str	str	str	str
"Ithaca"	"TRIANGLE"	"NY"	"6/1/1930 22:00"
"Willingboro"	"OTHER"	"NJ"	"6/30/1930 20:00"
"Holyoke"	"OVAL"	"CO"	"2/15/1931 14:00"
"Abilene"	"DISK"	"KS"	"6/1/1931 13:00"
"New York Worlds Fair"	"LIGHT"	"NY"	"4/18/1933 19:00"

# remove multiple columns at once
ufo.drop(["City", "State"]).head()

shape: (5, 2)

Shape Reported	Time
str	str
"TRIANGLE"	"6/1/1930 22:00"
"OTHER"	"6/30/1930 20:00"
"OVAL"	"2/15/1931 14:00"
"DISK"	"6/1/1931 13:00"
"LIGHT"	"4/18/1933 19:00"

Remove rows#

# new way to drop rows: specify index
# ufo.drop(index=[0, 1]).head()

Select#

Iterate#

ufo = pl.read_csv("data/ufo.csv")
ufo.head()

shape: (5, 5)

City	Colors Reported	Shape Reported	State	Time
str	str	str	str	str
"Ithaca"	null	"TRIANGLE"	"NY"	"6/1/1930 22:00"
"Willingboro"	null	"OTHER"	"NJ"	"6/30/1930 20:00"
"Holyoke"	null	"OVAL"	"CO"	"2/15/1931 14:00"
"Abilene"	null	"DISK"	"KS"	"6/1/1931 13:00"
"New York Worlds Fair"	null	"LIGHT"	"NY"	"4/18/1933 19:00"

# various methods are available to iterate through a DataFrame
for row in ufo[:10].iter_rows(named=True):
    print(row["City"], row["State"])

Ithaca NY
Willingboro NJ
Holyoke CO
Abilene KS
New York Worlds Fair NY
Valley City ND
Crater Lake CA
Alma MI
Eklutna AK
Hubbard OR

Index#

drinks = pl.read_csv("data/drinks.csv")
drinks.head()

shape: (5, 6)

country	beer_servings	spirit_servings	wine_servings	total_litres_of_pure_alcohol	continent
str	i64	i64	i64	f64	str
"Afghanistan"	0	0	0	0.0	"Asia"
"Albania"	89	132	54	4.9	"Europe"
"Algeria"	25	0	14	0.7	"Africa"
"Andorra"	245	138	312	12.4	"Europe"
"Angola"	217	57	45	5.9	"Africa"

# every DataFrame has an index (sometimes called the "row labels")
drinks.with_row_index()["index"]

shape: (193,)

index
u32
0
1
2
3
4
…
188
189
190
191
192

Select a column#

# select the 'City' Series using bracket notation
drinks["country"]

shape: (193,)

country
str
"Afghanistan"
"Albania"
"Algeria"
"Andorra"
"Angola"
…
"Venezuela"
"Vietnam"
"Yemen"
"Zambia"
"Zimbabwe"

Select numeric columns#

drinks.dtypes

[String, Int64, Int64, Int64, Float64, String]

drinks.select(pl.selectors.numeric()).dtypes

[Int64, Int64, Int64, Float64]

Select rows and columns#

# row 0, all columns
drinks[0, :]

shape: (1, 6)

country	beer_servings	spirit_servings	wine_servings	total_litres_of_pure_alcohol	continent
str	i64	i64	i64	f64	str
"Afghanistan"	0	0	0	0.0	"Asia"

# rows 0 and 1 and 2, all columns
drinks[[0, 1, 2], :]

shape: (3, 6)

country	beer_servings	spirit_servings	wine_servings	total_litres_of_pure_alcohol	continent
str	i64	i64	i64	f64	str
"Afghanistan"	0	0	0	0.0	"Asia"
"Albania"	89	132	54	4.9	"Europe"
"Algeria"	25	0	14	0.7	"Africa"

# rows 0 through 2 (inclusive), all columns
drinks[0:2, :]

shape: (2, 6)

country	beer_servings	spirit_servings	wine_servings	total_litres_of_pure_alcohol	continent
str	i64	i64	i64	f64	str
"Afghanistan"	0	0	0	0.0	"Asia"
"Albania"	89	132	54	4.9	"Europe"

# rows 0 through 2 (inclusive), column 'City'
drinks[0:2, "country"]

shape: (2,)

country
str
"Afghanistan"
"Albania"

# rows 0 through 2 (inclusive), columns 'City' and 'State'
drinks[0:2, ["country", "continent"]]

shape: (2, 2)

country	continent
str	str
"Afghanistan"	"Asia"
"Albania"	"Europe"

# rows 0 through 2 (inclusive), columns 'City' through 'State' (inclusive)
drinks[0:2, "country":"continent"]

shape: (2, 6)

country	beer_servings	spirit_servings	wine_servings	total_litres_of_pure_alcohol	continent
str	i64	i64	i64	f64	str
"Afghanistan"	0	0	0	0.0	"Asia"
"Albania"	89	132	54	4.9	"Europe"

# rows in positions 0 and 1, columns in positions 0 and 3
drinks[[0, 1], [0, 3]]

shape: (2, 2)

country	wine_servings
str	i64
"Afghanistan"	0
"Albania"	54

# rows in positions 0 through 2 (exclusive), columns in positions 0 through 4 (exclusive)
drinks[0:2, 0:4]

shape: (2, 4)

country	beer_servings	spirit_servings	wine_servings
str	i64	i64	i64
"Afghanistan"	0	0	0
"Albania"	89	132	54

# rows in positions 0 through 2 (exclusive), all columns
drinks[0:2, :]

shape: (2, 6)

country	beer_servings	spirit_servings	wine_servings	total_litres_of_pure_alcohol	continent
str	i64	i64	i64	f64	str
"Afghanistan"	0	0	0	0.0	"Asia"
"Albania"	89	132	54	4.9	"Europe"

Filter#

Column filter#

drinks.select(["beer_servings", "continent"])

shape: (193, 2)

beer_servings	continent
i64	str
0	"Asia"
89	"Europe"
25	"Africa"
245	"Europe"
217	"Africa"
…	…
333	"South America"
111	"Asia"
6	"Asia"
32	"Africa"
64	"Africa"

drinks.select(pl.selectors.contains("servings"))

shape: (193, 3)

beer_servings	spirit_servings	wine_servings
i64	i64	i64
0	0	0
89	132	54
25	0	14
245	138	312
217	57	45
…	…	…
333	100	3
111	2	1
6	0	0
32	19	4
64	18	4

Index filter#

drinks.with_row_index().filter(pl.col("index") % 2 == 0)

shape: (97, 7)

index	country	beer_servings	spirit_servings	wine_servings	total_litres_of_pure_alcohol	continent
u32	str	i64	i64	i64	f64	str
0	"Afghanistan"	0	0	0	0.0	"Asia"
2	"Algeria"	25	0	14	0.7	"Africa"
4	"Angola"	217	57	45	5.9	"Africa"
6	"Argentina"	193	25	221	8.3	"South America"
8	"Australia"	261	72	212	10.4	"Oceania"
…	…	…	…	…	…	…
184	"USA"	249	158	84	8.7	"North America"
186	"Uzbekistan"	25	101	8	2.4	"Asia"
188	"Venezuela"	333	100	3	7.7	"South America"
190	"Yemen"	6	0	0	0.1	"Asia"
192	"Zimbabwe"	64	18	4	4.7	"Africa"

Value filter#

drinks.filter(pl.col("continent") == "South America")

shape: (12, 6)

country	beer_servings	spirit_servings	wine_servings	total_litres_of_pure_alcohol	continent
str	i64	i64	i64	f64	str
"Argentina"	193	25	221	8.3	"South America"
"Bolivia"	167	41	8	3.8	"South America"
"Brazil"	245	145	16	7.2	"South America"
"Chile"	130	124	172	7.6	"South America"
"Colombia"	159	76	3	4.2	"South America"
…	…	…	…	…	…
"Paraguay"	213	117	74	7.3	"South America"
"Peru"	163	160	21	6.1	"South America"
"Suriname"	128	178	7	5.6	"South America"
"Uruguay"	115	35	220	6.6	"South America"
"Venezuela"	333	100	3	7.7	"South America"

movies = pl.read_csv("data/imdb_1000.csv")
movies.head()

shape: (5, 6)

star_rating	title	content_rating	genre	duration	actors_list
f64	str	str	str	i64	str
9.3	"The Shawshank Redemption"	"R"	"Crime"	142	"[u'Tim Robbins', u'Morgan Free…
9.2	"The Godfather"	"R"	"Crime"	175	"[u'Marlon Brando', u'Al Pacino…
9.1	"The Godfather: Part II"	"R"	"Crime"	200	"[u'Al Pacino', u'Robert De Nir…
9.0	"The Dark Knight"	"PG-13"	"Action"	152	"[u'Christian Bale', u'Heath Le…
8.9	"Pulp Fiction"	"R"	"Crime"	154	"[u'John Travolta', u'Uma Thurm…

# use the '&' operator to specify that both conditions are required
movies.filter((pl.col("duration") >= 200) & (pl.col("genre") == "Drama"))

shape: (3, 6)

star_rating	title	content_rating	genre	duration	actors_list
f64	str	str	str	i64	str
8.7	"Seven Samurai"	"UNRATED"	"Drama"	207	"[u'Toshir\xf4 Mifune', u'Takas…
8.2	"Gone with the Wind"	"G"	"Drama"	238	"[u'Clark Gable', u'Vivien Leig…
7.8	"Hamlet"	"PG-13"	"Drama"	242	"[u'Kenneth Branagh', u'Julie C…

# using the '|' operator would have shown movies that are either long or dramas (or both)
movies.filter((pl.col("duration") >= 200) | (pl.col("genre") == "Drama"))

shape: (287, 6)

star_rating	title	content_rating	genre	duration	actors_list
f64	str	str	str	i64	str
9.1	"The Godfather: Part II"	"R"	"Crime"	200	"[u'Al Pacino', u'Robert De Nir…
8.9	"12 Angry Men"	"NOT RATED"	"Drama"	96	"[u'Henry Fonda', u'Lee J. Cobb…
8.9	"The Lord of the Rings: The Ret…	"PG-13"	"Adventure"	201	"[u'Elijah Wood', u'Viggo Morte…
8.9	"Fight Club"	"R"	"Drama"	139	"[u'Brad Pitt', u'Edward Norton…
8.8	"Forrest Gump"	"PG-13"	"Drama"	142	"[u'Tom Hanks', u'Robin Wright'…
…	…	…	…	…	…
7.4	"My Sister's Keeper"	"PG-13"	"Drama"	109	"[u'Cameron Diaz', u'Abigail Br…
7.4	"The English Patient"	"R"	"Drama"	162	"[u'Ralph Fiennes', u'Juliette …
7.4	"Wonder Boys"	"R"	"Drama"	107	"[u'Michael Douglas', u'Tobey M…
7.4	"Blue Valentine"	"NC-17"	"Drama"	112	"[u'Ryan Gosling', u'Michelle W…
7.4	"The Cider House Rules"	"PG-13"	"Drama"	126	"[u'Tobey Maguire', u'Charlize …

Range filter#

movies.filter(pl.col("genre").is_in(["Crime", "Drama", "Action"])).head()

shape: (5, 6)

star_rating	title	content_rating	genre	duration	actors_list
f64	str	str	str	i64	str
9.3	"The Shawshank Redemption"	"R"	"Crime"	142	"[u'Tim Robbins', u'Morgan Free…
9.2	"The Godfather"	"R"	"Crime"	175	"[u'Marlon Brando', u'Al Pacino…
9.1	"The Godfather: Part II"	"R"	"Crime"	200	"[u'Al Pacino', u'Robert De Nir…
9.0	"The Dark Knight"	"PG-13"	"Action"	152	"[u'Christian Bale', u'Heath Le…
8.9	"Pulp Fiction"	"R"	"Crime"	154	"[u'John Travolta', u'Uma Thurm…

movies.filter(pl.col("duration").is_between(100, 200, closed="none")).head()

shape: (5, 6)

star_rating	title	content_rating	genre	duration	actors_list
f64	str	str	str	i64	str
9.3	"The Shawshank Redemption"	"R"	"Crime"	142	"[u'Tim Robbins', u'Morgan Free…
9.2	"The Godfather"	"R"	"Crime"	175	"[u'Marlon Brando', u'Al Pacino…
9.0	"The Dark Knight"	"PG-13"	"Action"	152	"[u'Christian Bale', u'Heath Le…
8.9	"Pulp Fiction"	"R"	"Crime"	154	"[u'John Travolta', u'Uma Thurm…
8.9	"The Good, the Bad and the Ugly"	"NOT RATED"	"Western"	161	"[u'Clint Eastwood', u'Eli Wall…

Reshape#

drinks = pl.read_csv("data/drinks.csv")
drinks.head()

shape: (5, 6)

country	beer_servings	spirit_servings	wine_servings	total_litres_of_pure_alcohol	continent
str	i64	i64	i64	f64	str
"Afghanistan"	0	0	0	0.0	"Asia"
"Albania"	89	132	54	4.9	"Europe"
"Algeria"	25	0	14	0.7	"Africa"
"Andorra"	245	138	312	12.4	"Europe"
"Angola"	217	57	45	5.9	"Africa"

# apply the 'max' function along axis 0 to calculate the maximum value in each column
drinks[:, "beer_servings":"wine_servings"].max()

shape: (1, 3)

beer_servings	spirit_servings	wine_servings
i64	i64	i64
376	438	370

# convert every DataFrame element into a float
drinks[:, "beer_servings":"wine_servings"].cast(float).head()

shape: (5, 3)

beer_servings	spirit_servings	wine_servings
f64	f64	f64
0.0	0.0	0.0
89.0	132.0	54.0
25.0	0.0	14.0
245.0	138.0	312.0
217.0	57.0	45.0

Concat#

df1 = pl.DataFrame({
    "name": ["A", "B", "C", "D"],
    "math": [60, 89, 82, 70],
    "physics": [66, 95, 83, 66],
    "chemistry": [61, 91, 77, 70],
})

df2 = pl.DataFrame({
    "name": ["E", "F", "G", "H"],
    "math": [66, 95, 83, 66],
    "physics": [60, 89, 82, 70],
    "chemistry": [90, 81, 78, 90],
})

df12 = pl.concat([df1, df2])
df12

shape: (8, 4)

name	math	physics	chemistry
str	i64	i64	i64
"A"	60	66	61
"B"	89	95	91
"C"	82	83	77
"D"	70	66	70
"E"	66	60	90
"F"	95	89	81
"G"	83	82	78
"H"	66	70	90

df12.select(sorted(df12.columns))

shape: (8, 4)

chemistry	math	name	physics
i64	i64	str	i64
61	60	"A"	66
91	89	"B"	95
77	82	"C"	83
70	70	"D"	66
90	66	"E"	60
81	95	"F"	89
78	83	"G"	82
90	66	"H"	70

Join#

movie_cols = ["movie_id", "title", "date"]
movies = pl.read_csv("data/u.item", separator="|", has_header=False)[:, :3]
movies.columns = movie_cols
movies.head()

shape: (5, 3)

movie_id	title	date
i64	str	str
1	"Toy Story (1995)"	"01-Jan-1995"
2	"GoldenEye (1995)"	"01-Jan-1995"
3	"Four Rooms (1995)"	"01-Jan-1995"
4	"Get Shorty (1995)"	"01-Jan-1995"
5	"Copycat (1995)"	"01-Jan-1995"

movies.shape

(1682, 3)

rating_cols = ["user_id", "movie_id", "rating", "timestamp"]

ratings = pl.read_csv("data/u.data", separator="\t", has_header=False)
ratings.columns = rating_cols
ratings.head()

shape: (5, 4)

user_id	movie_id	rating	timestamp
i64	i64	i64	i64
196	242	3	881250949
186	302	3	891717742
22	377	1	878887116
244	51	2	880606923
166	346	1	886397596

ratings.shape

(100000, 4)

ratings.filter(pl.col("movie_id") == 1).head()

shape: (5, 4)

user_id	movie_id	rating	timestamp
i64	i64	i64	i64
308	1	4	887736532
287	1	5	875334088
148	1	4	877019411
280	1	4	891700426
66	1	3	883601324

# Returns rows that have matching values in both tables
movies.join(ratings, on="movie_id").head()

shape: (5, 6)

movie_id	title	date	user_id	rating	timestamp
i64	str	str	i64	i64	i64
242	"Kolya (1996)"	"24-Jan-1997"	196	3	881250949
302	"L.A. Confidential (1997)"	"01-Jan-1997"	186	3	891717742
377	"Heavyweights (1994)"	"01-Jan-1994"	22	1	878887116
51	"Legends of the Fall (1994)"	"01-Jan-1994"	244	2	880606923
346	"Jackie Brown (1997)"	"01-Jan-1997"	166	1	886397596

# Returns all rows from the left table, and the matched rows from the right table
movies.join(ratings, on="movie_id", how="left").head()

shape: (5, 6)

movie_id	title	date	user_id	rating	timestamp
i64	str	str	i64	i64	i64
1	"Toy Story (1995)"	"01-Jan-1995"	308	4	887736532
1	"Toy Story (1995)"	"01-Jan-1995"	287	5	875334088
1	"Toy Story (1995)"	"01-Jan-1995"	148	4	877019411
1	"Toy Story (1995)"	"01-Jan-1995"	280	4	891700426
1	"Toy Story (1995)"	"01-Jan-1995"	66	3	883601324

# Returns all rows from the right table, and the matched rows from the left table
movies.join(ratings, on="movie_id", how="right").head()

shape: (5, 6)

title	date	user_id	movie_id	rating	timestamp
str	str	i64	i64	i64	i64
"Kolya (1996)"	"24-Jan-1997"	196	242	3	881250949
"L.A. Confidential (1997)"	"01-Jan-1997"	186	302	3	891717742
"Heavyweights (1994)"	"01-Jan-1994"	22	377	1	878887116
"Legends of the Fall (1994)"	"01-Jan-1994"	244	51	2	880606923
"Jackie Brown (1997)"	"01-Jan-1997"	166	346	1	886397596

# Returns all rows when there is a match in either left or right table
movies.join(ratings, on="movie_id", how="full").head()

shape: (5, 7)

movie_id	title	date	user_id	movie_id_right	rating	timestamp
i64	str	str	i64	i64	i64	i64
242	"Kolya (1996)"	"24-Jan-1997"	196	242	3	881250949
302	"L.A. Confidential (1997)"	"01-Jan-1997"	186	302	3	891717742
377	"Heavyweights (1994)"	"01-Jan-1994"	22	377	1	878887116
51	"Legends of the Fall (1994)"	"01-Jan-1994"	244	51	2	880606923
346	"Jackie Brown (1997)"	"01-Jan-1997"	166	346	1	886397596

# equals to inner join
movies.join(ratings, left_on="movie_id", right_on="movie_id").head()

shape: (5, 6)

movie_id	title	date	user_id	rating	timestamp
i64	str	str	i64	i64	i64
242	"Kolya (1996)"	"24-Jan-1997"	196	3	881250949
302	"L.A. Confidential (1997)"	"01-Jan-1997"	186	3	891717742
377	"Heavyweights (1994)"	"01-Jan-1994"	22	1	878887116
51	"Legends of the Fall (1994)"	"01-Jan-1994"	244	2	880606923
346	"Jackie Brown (1997)"	"01-Jan-1997"	166	1	886397596

MultiIndexes#

# close = pl.read_csv(
#     "data/stocks.csv", usecols=[0, 1, 3], index_col=["Symbol", "Date"]
# ).sort_index()
# close

# volume = pl.read_csv(
#     "data/stocks.csv", usecols=[0, 2, 3], index_col=["Symbol", "Date"]
# ).sort_index()
# volume

Explode#

orders = pl.read_csv("data/chipotle.tsv", separator="\t")
orders.head()

shape: (5, 5)

order_id	quantity	item_name	choice_description	item_price
i64	i64	str	str	str
1	1	"Chips and Fresh Tomato Salsa"	"NULL"	"$2.39"
1	1	"Izze"	"[Clementine]"	"$3.39"
1	1	"Nantucket Nectar"	"[Apple]"	"$3.39"
1	1	"Chips and Tomatillo-Green Chil…	"NULL"	"$2.39"
2	2	"Chicken Bowl"	"[Tomatillo-Red Chili Salsa (Ho…	"$16.98"

orders2 = orders.with_columns(pl.col("item_name").str.split(" "))
orders2.head()

shape: (5, 5)

order_id	quantity	item_name	choice_description	item_price
i64	i64	list[str]	str	str
1	1	["Chips", "and", … "Salsa"]	"NULL"	"$2.39"
1	1	["Izze"]	"[Clementine]"	"$3.39"
1	1	["Nantucket", "Nectar"]	"[Apple]"	"$3.39"
1	1	["Chips", "and", … "Salsa"]	"NULL"	"$2.39"
2	2	["Chicken", "Bowl"]	"[Tomatillo-Red Chili Salsa (Ho…	"$16.98"

orders2.explode("item_name")

shape: (11_501, 5)

order_id	quantity	item_name	choice_description	item_price
i64	i64	str	str	str
1	1	"Chips"	"NULL"	"$2.39"
1	1	"and"	"NULL"	"$2.39"
1	1	"Fresh"	"NULL"	"$2.39"
1	1	"Tomato"	"NULL"	"$2.39"
1	1	"Salsa"	"NULL"	"$2.39"
…	…	…	…	…
1834	1	"Salad"	"[Fresh Tomato Salsa, [Fajita V…	"$8.75"
1834	1	"Bowl"	"[Fresh Tomato Salsa, [Fajita V…	"$8.75"
1834	1	"Chicken"	"[Fresh Tomato Salsa, [Fajita V…	"$8.75"
1834	1	"Salad"	"[Fresh Tomato Salsa, [Fajita V…	"$8.75"
1834	1	"Bowl"	"[Fresh Tomato Salsa, [Fajita V…	"$8.75"

beer_servings	spirit_servings	wine_servings
i64	i64	i64
0	0	0
89	132	54
25	0	14
245	138	312
217	57	45
…	…	…
333	100	3
111	2	1
6	0	0
32	19	4
64	18	4

name	math	physics	chemistry
str	i64	i64	i64
"A"	60	66	61
"B"	89	95	91
"C"	82	83	77
"D"	70	66	70
"E"	66	60	90
"F"	95	89	81
"G"	83	82	78
"H"	66	70	90

chemistry	math	name	physics
i64	i64	str	i64
61	60	"A"	66
91	89	"B"	95
77	82	"C"	83
70	70	"D"	66
90	66	"E"	60
81	95	"F"	89
78	83	"G"	82
90	66	"H"	70

beer_servings	spirit_servings	wine_servings
i64	i64	i64
0	0	0
89	132	54
25	0	14
245	138	312
217	57	45
…	…	…
333	100	3
111	2	1
6	0	0
32	19	4
64	18	4

name	math	physics	chemistry
str	i64	i64	i64
"A"	60	66	61
"B"	89	95	91
"C"	82	83	77
"D"	70	66	70
"E"	66	60	90
"F"	95	89	81
"G"	83	82	78
"H"	66	70	90

chemistry	math	name	physics
i64	i64	str	i64
61	60	"A"	66
91	89	"B"	95
77	82	"C"	83
70	70	"D"	66
90	66	"E"	60
81	95	"F"	89
78	83	"G"	82
90	66	"H"	70

读写和过滤

Contents

读写和过滤#

Read & Info#

Read tabular data#

Describe data#

Column Manipulation#

Create a new column#

Rename columns#

Remove columns#

Remove rows#

Select#

Iterate#

Index#

Select a column#

Select numeric columns#

Select rows and columns#

Filter#

Column filter#

Index filter#

Value filter#

Range filter#

Reshape#

Concat#

Join#

MultiIndexes#

Explode#

beer_servings	spirit_servings	wine_servings
i64	i64	i64
0	0	0
89	132	54
25	0	14
245	138	312
217	57	45
…	…	…
333	100	3
111	2	1
6	0	0
32	19	4
64	18	4

name	math	physics	chemistry
str	i64	i64	i64
"A"	60	66	61
"B"	89	95	91
"C"	82	83	77
"D"	70	66	70
"E"	66	60	90
"F"	95	89	81
"G"	83	82	78
"H"	66	70	90

chemistry	math	name	physics
i64	i64	str	i64
61	60	"A"	66
91	89	"B"	95
77	82	"C"	83
70	70	"D"	66
90	66	"E"	60
81	95	"F"	89
78	83	"G"	82
90	66	"H"	70