Title: | More Efficient Tidyverse Code, Using Polars in the Background |
---|---|
Description: | Polars is a cross-language tool for manipulating very large data. However, one drawback is that the R implementation has a syntax that will look odd to many R users who are not used to Python syntax. The objective of tidypolars is to improve the ease-of-use of Polars in R by providing tidyverse syntax to polars. |
Authors: | Etienne Bacher [aut, cre, cph], Anatoly Tsyplenkov [ctb] (ORCID: <https://orcid.org/0000-0003-4144-8402>) |
Maintainer: | Etienne Bacher <[email protected]> |
License: | MIT + file LICENSE |
Version: | 0.14.1 |
Built: | 2025-08-12 19:02:14 UTC |
Source: | https://github.com/etiennebacher/tidypolars |
Order rows using column values
## S3 method for class 'polars_data_frame' arrange(.data, ..., .by_group = FALSE)
## S3 method for class 'polars_data_frame' arrange(.data, ..., .by_group = FALSE)
.data |
A Polars Data/LazyFrame |
... |
Variables, or functions of variables. Use |
.by_group |
If |
pl_test <- polars::pl$DataFrame( x1 = c("a", "a", "b", "a", "c"), x2 = c(2, 1, 5, 3, 1), value = sample(1:5) ) arrange(pl_test, x1) arrange(pl_test, x1, -x2) # if the data is grouped, you need to specify `.by_group = TRUE` to sort by # the groups first pl_test |> group_by(x1) |> arrange(-x2, .by_group = TRUE)
pl_test <- polars::pl$DataFrame( x1 = c("a", "a", "b", "a", "c"), x2 = c(2, 1, 5, 3, 1), value = sample(1:5) ) arrange(pl_test, x1) arrange(pl_test, x1, -x2) # if the data is grouped, you need to specify `.by_group = TRUE` to sort by # the groups first pl_test |> group_by(x1) |> arrange(-x2, .by_group = TRUE)
Append multiple Data/LazyFrames next to each other
bind_cols_polars(..., .name_repair = "unique")
bind_cols_polars(..., .name_repair = "unique")
... |
Polars DataFrames or LazyFrames to combine. Each argument can either be a Data/LazyFrame, or a list of Data/LazyFrames. Columns are matched by name. All Data/LazyFrames must have the same number of rows and there mustn't be duplicated column names. |
.name_repair |
Can be |
p1 <- polars::pl$DataFrame( x = sample(letters, 20), y = sample(1:100, 20) ) p2 <- polars::pl$DataFrame( z = sample(letters, 20), w = sample(1:100, 20) ) bind_cols_polars(p1, p2) bind_cols_polars(list(p1, p2))
p1 <- polars::pl$DataFrame( x = sample(letters, 20), y = sample(1:100, 20) ) p2 <- polars::pl$DataFrame( z = sample(letters, 20), w = sample(1:100, 20) ) bind_cols_polars(p1, p2) bind_cols_polars(list(p1, p2))
Stack multiple Data/LazyFrames on top of each other
bind_rows_polars(..., .id = NULL)
bind_rows_polars(..., .id = NULL)
... |
Polars DataFrames or LazyFrames to combine. Each argument can
either be a Data/LazyFrame, or a list of Data/LazyFrames. Columns are matched
by name, and any missing columns will be filled with |
.id |
The name of an optional identifier column. Provide a string to
create an output column that identifies each input. If all elements in
|
library(polars) p1 <- pl$DataFrame( x = c("a", "b"), y = 1:2 ) p2 <- pl$DataFrame( y = 3:4, z = c("c", "d") )$with_columns(pl$col("y")$cast(pl$Int16)) bind_rows_polars(p1, p2) # this is equivalent bind_rows_polars(list(p1, p2)) # create an id column bind_rows_polars(p1, p2, .id = "id") # create an id column with named elements bind_rows_polars(p1 = p1, p2 = p2, .id = "id")
library(polars) p1 <- pl$DataFrame( x = c("a", "b"), y = 1:2 ) p2 <- pl$DataFrame( y = 3:4, z = c("c", "d") )$with_columns(pl$col("y")$cast(pl$Int16)) bind_rows_polars(p1, p2) # this is equivalent bind_rows_polars(list(p1, p2)) # create an id column bind_rows_polars(p1, p2, .id = "id") # create an id column with named elements bind_rows_polars(p1 = p1, p2 = p2, .id = "id")
Turns implicit missing values into explicit missing values. This is useful for completing missing combinations of data.
## S3 method for class 'polars_data_frame' complete(data, ..., fill = list(), explicit = TRUE) ## S3 method for class 'polars_lazy_frame' complete(data, ..., fill = list(), explicit = TRUE)
## S3 method for class 'polars_data_frame' complete(data, ..., fill = list(), explicit = TRUE) ## S3 method for class 'polars_lazy_frame' complete(data, ..., fill = list(), explicit = TRUE)
data |
A Polars Data/LazyFrame |
... |
Any expression accepted by When used with continuous variables, you may need to fill in values that do
not appear in the data: to do so use expressions like |
fill |
A named list that for each variable supplies a single value to
use instead of |
explicit |
Should both implicit (newly created) and explicit
(pre-existing) missing values be filled by |
df <- polars::pl$DataFrame( group = c(1:2, 1, 2), item_id = c(1:2, 2, 3), item_name = c("a", "a", "b", "b"), value1 = c(1, NA, 3, 4), value2 = 4:7 ) df df |> complete(group, item_id, item_name) # Use `fill` to replace NAs with some value. By default, affects both new # (implicit) and pre-existing (explicit) missing values. df |> complete( group, item_id, item_name, fill = list(value1 = 0, value2 = 99) ) # Limit the fill to only the newly created (i.e. previously implicit) # missing values with `explicit = FALSE` df |> complete( group, item_id, item_name, fill = list(value1 = 0, value2 = 99), explicit = FALSE ) df |> group_by(group, maintain_order = TRUE) |> complete(item_id, item_name)
df <- polars::pl$DataFrame( group = c(1:2, 1, 2), item_id = c(1:2, 2, 3), item_name = c("a", "a", "b", "b"), value1 = c(1, NA, 3, 4), value2 = 4:7 ) df df |> complete(group, item_id, item_name) # Use `fill` to replace NAs with some value. By default, affects both new # (implicit) and pre-existing (explicit) missing values. df |> complete( group, item_id, item_name, fill = list(value1 = 0, value2 = 99) ) # Limit the fill to only the newly created (i.e. previously implicit) # missing values with `explicit = FALSE` df |> complete( group, item_id, item_name, fill = list(value1 = 0, value2 = 99), explicit = FALSE ) df |> group_by(group, maintain_order = TRUE) |> complete(item_id, item_name)
compute()
checks the query, optimizes it in the background, and runs it.
The output is a Polars DataFrame. collect()
is
similar to compute()
but converts the output to an R data.frame, which
consumes more memory.
Until tidypolars
0.7.0, there was only collect()
and it was used to
collect a LazyFrame into a Polars DataFrame. This usage is still valid for
now but will change in 0.8.0 to automatically convert a DataFrame to a
data.frame
. Use compute()
to have a Polars DataFrame as output.
## S3 method for class 'polars_lazy_frame' compute( x, ..., type_coercion = TRUE, predicate_pushdown = TRUE, projection_pushdown = TRUE, simplify_expression = TRUE, slice_pushdown = TRUE, comm_subplan_elim = TRUE, comm_subexpr_elim = TRUE, cluster_with_columns = TRUE, no_optimization = FALSE, engine = c("auto", "in-memory", "streaming"), streaming = FALSE ) ## S3 method for class 'polars_lazy_frame' collect( x, ..., type_coercion = TRUE, predicate_pushdown = TRUE, projection_pushdown = TRUE, simplify_expression = TRUE, slice_pushdown = TRUE, comm_subplan_elim = TRUE, comm_subexpr_elim = TRUE, cluster_with_columns = TRUE, no_optimization = FALSE, engine = c("auto", "in-memory", "streaming"), streaming = FALSE, .name_repair = "check_unique", uint8 = "integer", int64 = "double", date = "Date", time = "hms", decimal = "double", as_clock_class = FALSE, ambiguous = "raise", non_existent = "raise" )
## S3 method for class 'polars_lazy_frame' compute( x, ..., type_coercion = TRUE, predicate_pushdown = TRUE, projection_pushdown = TRUE, simplify_expression = TRUE, slice_pushdown = TRUE, comm_subplan_elim = TRUE, comm_subexpr_elim = TRUE, cluster_with_columns = TRUE, no_optimization = FALSE, engine = c("auto", "in-memory", "streaming"), streaming = FALSE ) ## S3 method for class 'polars_lazy_frame' collect( x, ..., type_coercion = TRUE, predicate_pushdown = TRUE, projection_pushdown = TRUE, simplify_expression = TRUE, slice_pushdown = TRUE, comm_subplan_elim = TRUE, comm_subexpr_elim = TRUE, cluster_with_columns = TRUE, no_optimization = FALSE, engine = c("auto", "in-memory", "streaming"), streaming = FALSE, .name_repair = "check_unique", uint8 = "integer", int64 = "double", date = "Date", time = "hms", decimal = "double", as_clock_class = FALSE, ambiguous = "raise", non_existent = "raise" )
x |
A Polars LazyFrame |
... |
Dots which should be empty. |
type_coercion |
Coerce types such that operations succeed and run on
minimal required memory (default is |
predicate_pushdown |
Applies filters as early as possible at scan level
(default is |
projection_pushdown |
Select only the columns that are needed at the scan
level (default is |
simplify_expression |
Various optimizations, such as constant folding
and replacing expensive operations with faster alternatives (default is
|
slice_pushdown |
Only load the required slice from the scan. Don't
materialize sliced outputs (default
is |
comm_subplan_elim |
Cache branching subplans that occur on self-joins or
unions (default is |
comm_subexpr_elim |
Cache common subexpressions (default is |
cluster_with_columns |
Combine sequential independent calls to
|
no_optimization |
Sets the following optimizations to |
engine |
The engine name to use for processing the query. One of the following:
|
streaming |
|
.name_repair , uint8 , int64 , date , time , decimal , as_clock_class , ambiguous , non_existent
|
Parameters to control the conversion from polars types to R. See
|
fetch()
for applying a lazy query on a subset of the data.
dat_lazy <- polars::as_polars_df(iris)$lazy() compute(dat_lazy) # you can build a query and add compute() as the last piece dat_lazy |> select(starts_with("Sepal")) |> filter(between(Sepal.Length, 5, 6)) |> compute() # call collect() instead to return a data.frame (note that this is more # expensive than compute()) dat_lazy |> select(starts_with("Sepal")) |> filter(between(Sepal.Length, 5, 6)) |> collect()
dat_lazy <- polars::as_polars_df(iris)$lazy() compute(dat_lazy) # you can build a query and add compute() as the last piece dat_lazy |> select(starts_with("Sepal")) |> filter(between(Sepal.Length, 5, 6)) |> compute() # call collect() instead to return a data.frame (note that this is more # expensive than compute()) dat_lazy |> select(starts_with("Sepal")) |> filter(between(Sepal.Length, 5, 6)) |> collect()
Count the observations in each group
## S3 method for class 'polars_data_frame' count(x, ..., wt = NULL, sort = FALSE, name = "n") ## S3 method for class 'polars_data_frame' tally(x, wt = NULL, sort = FALSE, name = "n") ## S3 method for class 'polars_lazy_frame' count(x, ..., wt = NULL, sort = FALSE, name = "n") ## S3 method for class 'polars_lazy_frame' tally(x, wt = NULL, sort = FALSE, name = "n") ## S3 method for class 'polars_data_frame' add_count(x, ..., wt = NULL, sort = FALSE, name = "n") ## S3 method for class 'polars_lazy_frame' add_count(x, ..., wt = NULL, sort = FALSE, name = "n")
## S3 method for class 'polars_data_frame' count(x, ..., wt = NULL, sort = FALSE, name = "n") ## S3 method for class 'polars_data_frame' tally(x, wt = NULL, sort = FALSE, name = "n") ## S3 method for class 'polars_lazy_frame' count(x, ..., wt = NULL, sort = FALSE, name = "n") ## S3 method for class 'polars_lazy_frame' tally(x, wt = NULL, sort = FALSE, name = "n") ## S3 method for class 'polars_data_frame' add_count(x, ..., wt = NULL, sort = FALSE, name = "n") ## S3 method for class 'polars_lazy_frame' add_count(x, ..., wt = NULL, sort = FALSE, name = "n")
x |
A Polars Data/LazyFrame |
... |
Any expression accepted by |
wt |
Not supported by tidypolars. |
sort |
If |
name |
Name of the new column. |
test <- polars::as_polars_df(mtcars) # grouping variables must be specified in count() and add_count() count(test, cyl) count(test, cyl, am) count(test, cyl, am, sort = TRUE, name = "count") add_count(test, cyl, am, sort = TRUE, name = "count") # tally() directly uses grouping variables of the input test |> group_by(cyl) |> tally() test |> group_by(cyl, am) |> tally(sort = TRUE, name = "count")
test <- polars::as_polars_df(mtcars) # grouping variables must be specified in count() and add_count() count(test, cyl) count(test, cyl, am) count(test, cyl, am, sort = TRUE, name = "count") add_count(test, cyl, am, sort = TRUE, name = "count") # tally() directly uses grouping variables of the input test |> group_by(cyl) |> tally() test |> group_by(cyl, am) |> tally(sort = TRUE, name = "count")
Cross joins match each row in x
to every row in y
, resulting in a dataset
with nrow(x) * nrow(y)
rows.
## S3 method for class 'polars_data_frame' cross_join(x, y, ..., suffix = c(".x", ".y")) ## S3 method for class 'polars_lazy_frame' cross_join(x, y, ..., suffix = c(".x", ".y"))
## S3 method for class 'polars_data_frame' cross_join(x, y, ..., suffix = c(".x", ".y")) ## S3 method for class 'polars_lazy_frame' cross_join(x, y, ..., suffix = c(".x", ".y"))
x , y
|
Two Polars Data/LazyFrames |
... |
Dots which should be empty. |
suffix |
If there are non-joined duplicate variables in |
Arguments that are supported by the original implementation in the tidyverse
but are not listed above will throw a warning by default if they are
specified. To change this behavior to error instead, use
options(tidypolars_unknown_args = "error")
.
test <- polars::pl$DataFrame( origin = c("ALG", "FRA", "GER"), year = c(2020, 2020, 2021) ) test2 <- polars::pl$DataFrame( destination = c("USA", "JPN", "BRA"), language = c("english", "japanese", "portuguese") ) test test2 cross_join(test, test2)
test <- polars::pl$DataFrame( origin = c("ALG", "FRA", "GER"), year = c(2020, 2020, 2021) ) test2 <- polars::pl$DataFrame( destination = c("USA", "JPN", "BRA"), language = c("english", "japanese", "portuguese") ) test test2 cross_join(test, test2)
This function is deprecated as of tidypolars 0.10.0, it will be removed in
a future update. Use summary()
with the same arguments instead.
describe(.data, percentiles = c(0.25, 0.5, 0.75))
describe(.data, percentiles = c(0.25, 0.5, 0.75))
.data |
A Polars DataFrame. |
percentiles |
One or more percentiles to include in the summary
statistics. All values must be between 0 and 1 ( |
Those functions are deprecated as of tidypolars 0.10.0, they will be removed
in a future update. Use explain()
with optimized = FALSE
to recover the
output of describe_plan()
, and with optimized = TRUE
(the default) to
get the output of describe_optimized_plan()
.
describe_plan(.data) describe_optimized_plan(.data)
describe_plan(.data) describe_optimized_plan(.data)
.data |
A Polars LazyFrame |
By default, duplicates are looked for in all variables. It is possible to specify a subset of variables where duplicates should be looked for. It is also possible to keep either the first occurrence or the last occurrence, or to remove all duplicates.
## S3 method for class 'polars_data_frame' distinct(.data, ..., keep = "first", maintain_order = TRUE) ## S3 method for class 'polars_lazy_frame' distinct(.data, ..., keep = "first", maintain_order = TRUE) duplicated_rows(.data, ...)
## S3 method for class 'polars_data_frame' distinct(.data, ..., keep = "first", maintain_order = TRUE) ## S3 method for class 'polars_lazy_frame' distinct(.data, ..., keep = "first", maintain_order = TRUE) duplicated_rows(.data, ...)
.data |
A Polars Data/LazyFrame |
... |
Any expression accepted by |
keep |
Either "first" (keep the first occurrence of the duplicated row), "last" (last occurrence) or "none" (remove all occurrences of duplicated rows). |
maintain_order |
Maintain row order. This is the default but it can slow down the process with large datasets and it prevents the use of streaming. |
pl_test <- polars::pl$DataFrame( iso_o = c(rep(c("AA", "AB"), each = 2), "AC", "DC"), iso_d = rep(c("BA", "BB", "BC"), each = 2), value = c(2, 2, 3, 4, 5, 6) ) distinct(pl_test) distinct(pl_test, iso_o) duplicated_rows(pl_test) duplicated_rows(pl_test, iso_o, iso_d)
pl_test <- polars::pl$DataFrame( iso_o = c(rep(c("AA", "AB"), each = 2), "AC", "DC"), iso_d = rep(c("BA", "BB", "BC"), each = 2), value = c(2, 2, 3, 4, 5, 6) ) distinct(pl_test) distinct(pl_test, iso_o) duplicated_rows(pl_test) duplicated_rows(pl_test, iso_o, iso_d)
By default, this will drop rows that contain any missing values. It is possible to specify a subset of variables so that only missing values in these variables will be considered.
## S3 method for class 'polars_data_frame' drop_na(data, ...) ## S3 method for class 'polars_lazy_frame' drop_na(data, ...)
## S3 method for class 'polars_data_frame' drop_na(data, ...) ## S3 method for class 'polars_lazy_frame' drop_na(data, ...)
data |
A Polars Data/LazyFrame |
... |
Any expression accepted by |
tmp <- mtcars tmp[1:3, "mpg"] <- NA tmp[4, "hp"] <- NA pl_tmp <- as_polars_df(tmp) drop_na(pl_tmp) drop_na(pl_tmp, hp, mpg)
tmp <- mtcars tmp[1:3, "mpg"] <- NA tmp[4, "hp"] <- NA pl_tmp <- as_polars_df(tmp) drop_na(pl_tmp) drop_na(pl_tmp, hp, mpg)
This function is available for LazyFrame
s only.
By default, explain()
shows the query plan that is optimized and then run
by Polars. Setting optimized = FALSE
shows the query plan as-is, without
any optimization done, but this is not the query performed. Note that the
plans are read from bottom to top.
## S3 method for class 'polars_lazy_frame' explain(x, optimized = TRUE, ...)
## S3 method for class 'polars_lazy_frame' explain(x, optimized = TRUE, ...)
x |
A Polars LazyFrame. |
optimized |
Logical. If |
... |
Ignored. |
query <- mtcars |> as_polars_lf() |> arrange(drat) |> filter(cyl == 3) |> select(mpg) # unoptimized query plan: no_opt <- explain(query, optimized = FALSE) no_opt # better printing with cat(): cat(no_opt) # optimized query run by polars cat(explain(query))
query <- mtcars |> as_polars_lf() |> arrange(drat) |> filter(cyl == 3) |> select(mpg) # unoptimized query plan: no_opt <- explain(query, optimized = FALSE) no_opt # better printing with cat(): cat(no_opt) # optimized query run by polars cat(explain(query))
n
rows of a LazyFrameUse head()
before collect()
to only get a subset of the data.
fetch( .data, n_rows = 500, type_coercion = TRUE, predicate_pushdown = TRUE, projection_pushdown = TRUE, simplify_expression = TRUE, slice_pushdown = TRUE, comm_subplan_elim = TRUE, comm_subexpr_elim = TRUE, cluster_with_columns = TRUE, no_optimization = FALSE, engine = c("auto", "in-memory", "streaming"), streaming = FALSE )
fetch( .data, n_rows = 500, type_coercion = TRUE, predicate_pushdown = TRUE, projection_pushdown = TRUE, simplify_expression = TRUE, slice_pushdown = TRUE, comm_subplan_elim = TRUE, comm_subexpr_elim = TRUE, cluster_with_columns = TRUE, no_optimization = FALSE, engine = c("auto", "in-memory", "streaming"), streaming = FALSE )
.data |
A Polars LazyFrame |
n_rows |
Number of rows to fetch. |
type_coercion |
Coerce types such that operations succeed and run on
minimal required memory (default is |
predicate_pushdown |
Applies filters as early as possible at scan level
(default is |
projection_pushdown |
Select only the columns that are needed at the scan
level (default is |
simplify_expression |
Various optimizations, such as constant folding
and replacing expensive operations with faster alternatives (default is
|
slice_pushdown |
Only load the required slice from the scan. Don't
materialize sliced outputs (default
is |
comm_subplan_elim |
Cache branching subplans that occur on self-joins or
unions (default is |
comm_subexpr_elim |
Cache common subexpressions (default is |
cluster_with_columns |
Combine sequential independent calls to
|
no_optimization |
Sets the following optimizations to |
engine |
The engine name to use for processing the query. One of the following:
|
streaming |
The parameter n_rows
indicates how many rows from the LazyFrame should be
used at the beginning of the query, but it doesn't guarantee that n_rows
will
be returned. For example, if the query contains a filter or join operations
with other datasets, then the final number of rows can be lower than n_rows
.
On the other hand, appending some rows during the query can lead to an output
that has more rows than n_rows
.
collect()
for applying a lazy query on the full data.
Fills missing values in selected columns using the next or previous entry. This is useful in the common output format where values are not repeated, and are only recorded when they change.
## S3 method for class 'polars_data_frame' fill(data, ..., .direction = c("down", "up", "downup", "updown"))
## S3 method for class 'polars_data_frame' fill(data, ..., .direction = c("down", "up", "downup", "updown"))
data |
A Polars Data/LazyFrame |
... |
Any expression accepted by |
.direction |
Direction in which to fill missing values. Either "down" (the default), "up", "downup" (i.e. first down and then up) or "updown" (first up and then down). |
With grouped Data/LazyFrames, fill() will be applied within each group, meaning that it won't fill across group boundaries.
pl_test <- polars::pl$DataFrame(x = c(NA, 1), y = c(2, NA)) fill(pl_test, everything(), .direction = "down") fill(pl_test, everything(), .direction = "up") # with grouped data, it doesn't use values from other groups pl_grouped <- polars::pl$DataFrame( grp = rep(c("A", "B"), each = 3), x = c(1, NA, NA, NA, 2, NA), y = c(3, NA, 4, NA, 3, 1) ) |> group_by(grp) fill(pl_grouped, x, y, .direction = "down")
pl_test <- polars::pl$DataFrame(x = c(NA, 1), y = c(2, NA)) fill(pl_test, everything(), .direction = "down") fill(pl_test, everything(), .direction = "up") # with grouped data, it doesn't use values from other groups pl_grouped <- polars::pl$DataFrame( grp = rep(c("A", "B"), each = 3), x = c(1, NA, NA, NA, 2, NA), y = c(3, NA, 4, NA, 3, 1) ) |> group_by(grp) fill(pl_grouped, x, y, .direction = "down")
This function is used to subset a data frame, retaining all rows that satisfy
your conditions. To be retained, the row must produce a value of TRUE for all
conditions. Note that when a condition evaluates to NA the row will be
dropped, unlike base subsetting with [
.
## S3 method for class 'polars_data_frame' filter(.data, ..., .by = NULL) ## S3 method for class 'polars_lazy_frame' filter(.data, ..., .by = NULL)
## S3 method for class 'polars_data_frame' filter(.data, ..., .by = NULL) ## S3 method for class 'polars_lazy_frame' filter(.data, ..., .by = NULL)
.data |
A Polars Data/LazyFrame |
... |
Expressions that return a logical value, and are defined in terms
of the variables in the data. If multiple expressions are included, they
will be combined with the & operator. Only rows for which all conditions
evaluate to |
.by |
Optionally, a selection of columns to group by for just this
operation, functioning as an alternative to |
pl_iris <- polars::as_polars_df(iris) filter(pl_iris, Sepal.Length < 5, Species == "setosa") filter(pl_iris, Sepal.Length < Sepal.Width + Petal.Length) filter(pl_iris, Species == "setosa" | is.na(Species)) iris2 <- iris iris2$Species <- as.character(iris2$Species) iris2 |> as_polars_df() |> filter(Species %in% c("setosa", "virginica")) # filter by group pl_iris |> group_by(Species) |> filter(Sepal.Length == max(Sepal.Length)) |> ungroup() # an alternative syntax for grouping is to use `.by` pl_iris |> filter(Sepal.Length == max(Sepal.Length), .by = Species)
pl_iris <- polars::as_polars_df(iris) filter(pl_iris, Sepal.Length < 5, Species == "setosa") filter(pl_iris, Sepal.Length < Sepal.Width + Petal.Length) filter(pl_iris, Species == "setosa" | is.na(Species)) iris2 <- iris iris2$Species <- as.character(iris2$Species) iris2 |> as_polars_df() |> filter(Species %in% c("setosa", "virginica")) # filter by group pl_iris |> group_by(Species) |> filter(Sepal.Length == max(Sepal.Length)) |> ungroup() # an alternative syntax for grouping is to use `.by` pl_iris |> filter(Sepal.Length == max(Sepal.Length), .by = Species)
read_csv_polars()
imports the data as a Polars DataFrame.
scan_csv_polars()
imports the data as a Polars LazyFrame.
read_csv_polars( source, ..., has_header = TRUE, separator = ",", comment_prefix = NULL, quote_char = "\"", skip_rows = 0, schema = NULL, schema_overrides = NULL, null_values = NULL, ignore_errors = FALSE, cache = FALSE, infer_schema_length = 100, n_rows = NULL, encoding = "utf8", low_memory = FALSE, rechunk = TRUE, skip_rows_after_header = 0, row_index_name = NULL, row_index_offset = 0, try_parse_dates = FALSE, eol_char = "\n", raise_if_empty = TRUE, truncate_ragged_lines = FALSE, include_file_paths = NULL, dtypes, reuse_downloaded ) scan_csv_polars( source, ..., has_header = TRUE, separator = ",", comment_prefix = NULL, quote_char = "\"", skip_rows = 0, schema = NULL, schema_overrides = NULL, null_values = NULL, ignore_errors = FALSE, cache = FALSE, infer_schema_length = 100, n_rows = NULL, encoding = "utf8", low_memory = FALSE, rechunk = TRUE, skip_rows_after_header = 0, row_index_name = NULL, row_index_offset = 0, try_parse_dates = FALSE, eol_char = "\n", raise_if_empty = TRUE, truncate_ragged_lines = FALSE, include_file_paths = NULL, dtypes, reuse_downloaded )
read_csv_polars( source, ..., has_header = TRUE, separator = ",", comment_prefix = NULL, quote_char = "\"", skip_rows = 0, schema = NULL, schema_overrides = NULL, null_values = NULL, ignore_errors = FALSE, cache = FALSE, infer_schema_length = 100, n_rows = NULL, encoding = "utf8", low_memory = FALSE, rechunk = TRUE, skip_rows_after_header = 0, row_index_name = NULL, row_index_offset = 0, try_parse_dates = FALSE, eol_char = "\n", raise_if_empty = TRUE, truncate_ragged_lines = FALSE, include_file_paths = NULL, dtypes, reuse_downloaded ) scan_csv_polars( source, ..., has_header = TRUE, separator = ",", comment_prefix = NULL, quote_char = "\"", skip_rows = 0, schema = NULL, schema_overrides = NULL, null_values = NULL, ignore_errors = FALSE, cache = FALSE, infer_schema_length = 100, n_rows = NULL, encoding = "utf8", low_memory = FALSE, rechunk = TRUE, skip_rows_after_header = 0, row_index_name = NULL, row_index_offset = 0, try_parse_dates = FALSE, eol_char = "\n", raise_if_empty = TRUE, truncate_ragged_lines = FALSE, include_file_paths = NULL, dtypes, reuse_downloaded )
source |
Path(s) to a file or directory. When needing to authenticate
for scanning cloud locations, see the |
... |
These dots are for future extensions and must be empty. |
has_header |
Indicate if the first row of the dataset is a header or not. If
|
separator |
Single byte character to use as separator in the file. |
comment_prefix |
A string, which can be up to 5 symbols in length, used
to indicate the start of a comment line. For instance, it can be set to |
quote_char |
Single byte character used for quoting. Set to |
skip_rows |
Start reading after a particular number of rows. The header will be parsed at this offset. |
schema |
Provide the schema. This means that polars doesn't do schema
inference. This argument expects the complete schema, whereas
|
schema_overrides |
Overwrite dtypes during inference. This must be a list. Names of list elements are used to match to inferred columns. |
null_values |
Character vector specifying the values to interpret as
|
ignore_errors |
Keep reading the file even if some lines yield errors.
You can also use |
cache |
Cache the result after reading. |
infer_schema_length |
The maximum number of rows to scan for schema
inference. If |
n_rows |
Stop reading from the source after reading |
encoding |
Either |
low_memory |
Reduce memory pressure at the expense of performance. |
rechunk |
Reallocate to contiguous memory when all chunks/files are parsed. |
skip_rows_after_header |
Skip this number of rows when the header is parsed. |
row_index_name |
If not |
row_index_offset |
Offset to start the row index column (only used if
the name is set by |
try_parse_dates |
Try to automatically parse dates. Most ISO8601-like
formats can be inferred, as well as a handful of others. If this does not
succeed, the column remains of data type |
eol_char |
Single byte end of line character (default: |
raise_if_empty |
If |
truncate_ragged_lines |
Truncate lines that are longer than the schema. |
include_file_paths |
Include the path of the source file(s) as a column with this name. |
dtypes |
|
reuse_downloaded |
read_ipc_polars()
imports the data as a Polars DataFrame.
scan_ipc_polars()
imports the data as a Polars LazyFrame.
read_ipc_polars( source, ..., n_rows = NULL, row_index_name = NULL, row_index_offset = 0L, rechunk = FALSE, cache = TRUE, include_file_paths = NULL, memory_map ) scan_ipc_polars( source, ..., n_rows = NULL, row_index_name = NULL, row_index_offset = 0L, rechunk = FALSE, cache = TRUE, include_file_paths = NULL, memory_map )
read_ipc_polars( source, ..., n_rows = NULL, row_index_name = NULL, row_index_offset = 0L, rechunk = FALSE, cache = TRUE, include_file_paths = NULL, memory_map ) scan_ipc_polars( source, ..., n_rows = NULL, row_index_name = NULL, row_index_offset = 0L, rechunk = FALSE, cache = TRUE, include_file_paths = NULL, memory_map )
source |
Path(s) to a file or directory. When needing to authenticate
for scanning cloud locations, see the |
... |
These dots are for future extensions and must be empty. |
n_rows |
Stop reading from the source after reading |
row_index_name |
If not |
row_index_offset |
Offset to start the row index column (only used if
the name is set by |
rechunk |
Reallocate to contiguous memory when all chunks/files are parsed. |
cache |
Cache the result after reading. |
include_file_paths |
Include the path of the source file(s) as a column with this name. |
memory_map |
read_ndjson_polars()
imports the data as a Polars DataFrame.
scan_ndjson_polars()
imports the data as a Polars LazyFrame.
read_ndjson_polars( source, ..., infer_schema_length = 100, batch_size = NULL, n_rows = NULL, low_memory = FALSE, rechunk = FALSE, row_index_name = NULL, row_index_offset = 0, ignore_errors = FALSE, reuse_downloaded ) scan_ndjson_polars( source, ..., infer_schema_length = 100, batch_size = NULL, n_rows = NULL, low_memory = FALSE, rechunk = FALSE, row_index_name = NULL, row_index_offset = 0, ignore_errors = FALSE, reuse_downloaded )
read_ndjson_polars( source, ..., infer_schema_length = 100, batch_size = NULL, n_rows = NULL, low_memory = FALSE, rechunk = FALSE, row_index_name = NULL, row_index_offset = 0, ignore_errors = FALSE, reuse_downloaded ) scan_ndjson_polars( source, ..., infer_schema_length = 100, batch_size = NULL, n_rows = NULL, low_memory = FALSE, rechunk = FALSE, row_index_name = NULL, row_index_offset = 0, ignore_errors = FALSE, reuse_downloaded )
source |
Path(s) to a file or directory. When needing to authenticate
for scanning cloud locations, see the |
... |
These dots are for future extensions and must be empty. |
infer_schema_length |
The maximum number of rows to scan for schema
inference. If |
batch_size |
Number of rows to read in each batch. |
n_rows |
Stop reading from the source after reading |
low_memory |
Reduce memory pressure at the expense of performance. |
rechunk |
Reallocate to contiguous memory when all chunks/files are parsed. |
row_index_name |
If not |
row_index_offset |
Offset to start the row index column (only used if
the name is set by |
ignore_errors |
Keep reading the file even if some lines yield errors.
You can also use |
reuse_downloaded |
read_parquet_polars()
imports the data as a Polars DataFrame.
scan_parquet_polars()
imports the data as a Polars LazyFrame.
read_parquet_polars( source, ..., n_rows = NULL, row_index_name = NULL, row_index_offset = 0L, parallel = "auto", hive_partitioning = NULL, hive_schema = NULL, try_parse_hive_dates = TRUE, glob = TRUE, rechunk = TRUE, low_memory = FALSE, storage_options = NULL, use_statistics = TRUE, cache = TRUE, include_file_paths = NULL ) scan_parquet_polars( source, ..., n_rows = NULL, row_index_name = NULL, row_index_offset = 0L, parallel = "auto", hive_partitioning = NULL, hive_schema = NULL, try_parse_hive_dates = TRUE, glob = TRUE, rechunk = FALSE, low_memory = FALSE, storage_options = NULL, use_statistics = TRUE, cache = TRUE, include_file_paths = NULL )
read_parquet_polars( source, ..., n_rows = NULL, row_index_name = NULL, row_index_offset = 0L, parallel = "auto", hive_partitioning = NULL, hive_schema = NULL, try_parse_hive_dates = TRUE, glob = TRUE, rechunk = TRUE, low_memory = FALSE, storage_options = NULL, use_statistics = TRUE, cache = TRUE, include_file_paths = NULL ) scan_parquet_polars( source, ..., n_rows = NULL, row_index_name = NULL, row_index_offset = 0L, parallel = "auto", hive_partitioning = NULL, hive_schema = NULL, try_parse_hive_dates = TRUE, glob = TRUE, rechunk = FALSE, low_memory = FALSE, storage_options = NULL, use_statistics = TRUE, cache = TRUE, include_file_paths = NULL )
Most data operations are done on groups defined by variables. group_by()
takes an existing Polars Data/LazyFrame and converts it into a grouped one
where operations are performed "by group". ungroup()
removes grouping.
## S3 method for class 'polars_data_frame' group_by(.data, ..., maintain_order = FALSE, .add = FALSE) ## S3 method for class 'polars_data_frame' ungroup(x, ...) ## S3 method for class 'polars_lazy_frame' group_by(.data, ..., maintain_order = FALSE, .add = FALSE) ## S3 method for class 'polars_lazy_frame' ungroup(x, ...)
## S3 method for class 'polars_data_frame' group_by(.data, ..., maintain_order = FALSE, .add = FALSE) ## S3 method for class 'polars_data_frame' ungroup(x, ...) ## S3 method for class 'polars_lazy_frame' group_by(.data, ..., maintain_order = FALSE, .add = FALSE) ## S3 method for class 'polars_lazy_frame' ungroup(x, ...)
.data |
A Polars Data/LazyFrame |
... |
Variables to group by (used in |
maintain_order |
Maintain row order. For performance reasons, this is
|
.add |
When |
x |
A Polars Data/LazyFrame |
by_cyl <- mtcars |> as_polars_df() |> group_by(cyl) by_cyl by_cyl |> summarise( disp = mean(disp), hp = mean(hp) ) by_cyl |> filter(disp == max(disp))
by_cyl <- mtcars |> as_polars_df() |> group_by(cyl) by_cyl by_cyl |> summarise( disp = mean(disp), hp = mean(hp) ) by_cyl |> filter(disp == max(disp))
group_vars()
returns a character vector with the names of the grouping
variables. group_keys()
returns a data frame with one row per group.
## S3 method for class 'polars_data_frame' group_split(.tbl, ..., .keep = TRUE)
## S3 method for class 'polars_data_frame' group_split(.tbl, ..., .keep = TRUE)
.tbl |
A Polars Data/LazyFrame |
... |
If |
.keep |
Should the grouping columns be kept? |
pl_g <- polars::as_polars_df(iris) |> group_by(Species) group_split(pl_g)
pl_g <- polars::as_polars_df(iris) |> group_by(Species) group_split(pl_g)
group_vars()
returns a character vector with the names of the grouping
variables. group_keys()
returns a data frame with one row per group.
## S3 method for class 'polars_data_frame' group_vars(x) ## S3 method for class 'polars_lazy_frame' group_vars(x) ## S3 method for class 'polars_data_frame' group_keys(.tbl, ...) ## S3 method for class 'polars_lazy_frame' group_keys(.tbl, ...)
## S3 method for class 'polars_data_frame' group_vars(x) ## S3 method for class 'polars_lazy_frame' group_vars(x) ## S3 method for class 'polars_data_frame' group_keys(.tbl, ...) ## S3 method for class 'polars_lazy_frame' group_keys(.tbl, ...)
x , .tbl
|
A Polars Data/LazyFrame |
... |
These dots are for future extensions and must be empty. |
pl_g <- polars::as_polars_df(mtcars) |> group_by(cyl, am) group_vars(pl_g) group_keys(pl_g)
pl_g <- polars::as_polars_df(mtcars) |> group_by(cyl, am) group_vars(pl_g) group_keys(pl_g)
Mutating joins add columns from y
to x
, matching observations based on
the keys.
## S3 method for class 'polars_data_frame' left_join( x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ..., keep = NULL, na_matches = "na", relationship = NULL ) ## S3 method for class 'polars_data_frame' right_join( x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ..., keep = NULL, na_matches = "na", relationship = NULL ) ## S3 method for class 'polars_data_frame' full_join( x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ..., keep = NULL, na_matches = "na", relationship = NULL ) ## S3 method for class 'polars_data_frame' inner_join( x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ..., keep = NULL, na_matches = "na", relationship = NULL ) ## S3 method for class 'polars_lazy_frame' left_join( x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ..., keep = NULL, na_matches = "na", relationship = NULL ) ## S3 method for class 'polars_lazy_frame' right_join( x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ..., keep = NULL, na_matches = "na", relationship = NULL ) ## S3 method for class 'polars_lazy_frame' full_join( x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ..., keep = NULL, na_matches = "na", relationship = NULL ) ## S3 method for class 'polars_lazy_frame' inner_join( x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ..., keep = NULL, na_matches = "na", relationship = NULL )
## S3 method for class 'polars_data_frame' left_join( x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ..., keep = NULL, na_matches = "na", relationship = NULL ) ## S3 method for class 'polars_data_frame' right_join( x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ..., keep = NULL, na_matches = "na", relationship = NULL ) ## S3 method for class 'polars_data_frame' full_join( x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ..., keep = NULL, na_matches = "na", relationship = NULL ) ## S3 method for class 'polars_data_frame' inner_join( x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ..., keep = NULL, na_matches = "na", relationship = NULL ) ## S3 method for class 'polars_lazy_frame' left_join( x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ..., keep = NULL, na_matches = "na", relationship = NULL ) ## S3 method for class 'polars_lazy_frame' right_join( x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ..., keep = NULL, na_matches = "na", relationship = NULL ) ## S3 method for class 'polars_lazy_frame' full_join( x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ..., keep = NULL, na_matches = "na", relationship = NULL ) ## S3 method for class 'polars_lazy_frame' inner_join( x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ..., keep = NULL, na_matches = "na", relationship = NULL )
x , y
|
Two Polars Data/LazyFrames |
by |
Variables to join by. If
Finally, |
copy , keep
|
Not supported. |
suffix |
If there are non-joined duplicate variables in |
... |
Dots which should be empty. |
na_matches |
Should two
Note that when joining Polars Data/LazyFrames, |
relationship |
Handling of the expected relationship between the keys of
|
Arguments that are supported by the original implementation in the tidyverse
but are not listed above will throw a warning by default if they are
specified. To change this behavior to error instead, use
options(tidypolars_unknown_args = "error")
.
test <- polars::pl$DataFrame( x = c(1, 2, 3), y1 = c(1, 2, 3), z = c(1, 2, 3) ) test2 <- polars::pl$DataFrame( x = c(1, 2, 4), y2 = c(1, 2, 4), z2 = c(4, 5, 7) ) test test2 # default is to use common columns, here "x" only left_join(test, test2) # we can specify the columns on which to join with join_by()... left_join(test, test2, by = join_by(x, y1 == y2)) # ... or with a character vector left_join(test, test2, by = c("x", "y1" = "y2")) # we can customize the suffix of common column names not used to join test2 <- polars::pl$DataFrame( x = c(1, 2, 4), y1 = c(1, 2, 4), z = c(4, 5, 7) ) left_join(test, test2, by = "x", suffix = c("_left", "_right")) # the argument "relationship" ensures the join matches the expectation country <- polars::pl$DataFrame( iso = c("FRA", "DEU"), value = 1:2 ) country country_year <- polars::pl$DataFrame( iso = rep(c("FRA", "DEU"), each = 2), year = rep(2019:2020, 2), value2 = 3:6 ) country_year # We expect that each row in "x" matches only one row in "y" but, it's not # true as each row of "x" matches two rows of "y" tryCatch( left_join(country, country_year, join_by(iso), relationship = "one-to-one"), error = function(e) e ) # A correct expectation would be "one-to-many": left_join(country, country_year, join_by(iso), relationship = "one-to-many")
test <- polars::pl$DataFrame( x = c(1, 2, 3), y1 = c(1, 2, 3), z = c(1, 2, 3) ) test2 <- polars::pl$DataFrame( x = c(1, 2, 4), y2 = c(1, 2, 4), z2 = c(4, 5, 7) ) test test2 # default is to use common columns, here "x" only left_join(test, test2) # we can specify the columns on which to join with join_by()... left_join(test, test2, by = join_by(x, y1 == y2)) # ... or with a character vector left_join(test, test2, by = c("x", "y1" = "y2")) # we can customize the suffix of common column names not used to join test2 <- polars::pl$DataFrame( x = c(1, 2, 4), y1 = c(1, 2, 4), z = c(4, 5, 7) ) left_join(test, test2, by = "x", suffix = c("_left", "_right")) # the argument "relationship" ensures the join matches the expectation country <- polars::pl$DataFrame( iso = c("FRA", "DEU"), value = 1:2 ) country country_year <- polars::pl$DataFrame( iso = rep(c("FRA", "DEU"), each = 2), year = rep(2019:2020, 2), value2 = 3:6 ) country_year # We expect that each row in "x" matches only one row in "y" but, it's not # true as each row of "x" matches two rows of "y" tryCatch( left_join(country, country_year, join_by(iso), relationship = "one-to-one"), error = function(e) e ) # A correct expectation would be "one-to-many": left_join(country, country_year, join_by(iso), relationship = "one-to-many")
Create a column with unique id per row values
make_unique_id(.data, ..., new_col = "hash")
make_unique_id(.data, ..., new_col = "hash")
.data |
A Polars Data/LazyFrame |
... |
Any expression accepted by |
new_col |
Name of the new column |
mtcars |> as_polars_df() |> make_unique_id(am, gear)
mtcars |> as_polars_df() |> make_unique_id(am, gear)
This creates new columns that are functions of existing variables. It can also modify (if the name is the same as an existing column) and delete columns (by setting their value to NULL).
## S3 method for class 'polars_data_frame' mutate(.data, ..., .by = NULL, .keep = c("all", "used", "unused", "none")) ## S3 method for class 'polars_lazy_frame' mutate(.data, ..., .by = NULL, .keep = c("all", "used", "unused", "none"))
## S3 method for class 'polars_data_frame' mutate(.data, ..., .by = NULL, .keep = c("all", "used", "unused", "none")) ## S3 method for class 'polars_lazy_frame' mutate(.data, ..., .by = NULL, .keep = c("all", "used", "unused", "none"))
.data |
A Polars Data/LazyFrame |
... |
Name-value pairs. The name gives the name of the column in the output. The value can be:
|
.by |
Optionally, a selection of columns to group by for just this
operation, functioning as an alternative to |
.keep |
Control which columns from
|
A lot of functions available in base R (cos, mean, multiplying, etc.) or in other packages (dplyr::lag(), etc.) are implemented in an efficient way in Polars. These functions are automatically translated to Polars syntax under the hood so that you can continue using the classic R syntax and functions.
If a Polars built-in replacement doesn't exist (for example for custom
functions), then tidypolars
will throw an error. See the vignette on Polars
expressions to know how to write custom functions that are accepted by
tidypolars
.
pl_iris <- polars::as_polars_df(iris) # classic operation mutate(pl_iris, x = Sepal.Width + Sepal.Length) # logical operation mutate(pl_iris, x = Sepal.Width > Sepal.Length & Petal.Width > Petal.Length) # overwrite existing variable mutate(pl_iris, Sepal.Width = Sepal.Width * 2) # grouped computation pl_iris |> group_by(Species) |> mutate(foo = mean(Sepal.Length)) # an alternative syntax for grouping is to use `.by` pl_iris |> mutate(foo = mean(Sepal.Length), .by = Species) # across() is available pl_iris |> mutate( across(.cols = contains("Sepal"), .fns = mean, .names = "{.fn}_of_{.col}") ) # # It can receive several types of functions: pl_iris |> mutate( across( .cols = contains("Sepal"), .fns = list(mean = mean, sd = ~ sd(.x)), .names = "{.fn}_of_{.col}" ) ) # Be careful when using across(.cols = where(...), ...) as it will not include # variables created in the same `...` (this is only the case for `where()`): ## Not run: pl_iris |> mutate( foo = 1, across( .cols = where(is.numeric), \(x) x - 1000 # <<<<<<<<< this will not be applied on variable "foo" ) ) ## End(Not run) # Warning message: # In `across()`, the argument `.cols = where(is.numeric)` will not take into account # variables created in the same `mutate()`/`summarize` call. # Embracing an external variable works some_value <- 1 mutate(pl_iris, x = {{ some_value }})
pl_iris <- polars::as_polars_df(iris) # classic operation mutate(pl_iris, x = Sepal.Width + Sepal.Length) # logical operation mutate(pl_iris, x = Sepal.Width > Sepal.Length & Petal.Width > Petal.Length) # overwrite existing variable mutate(pl_iris, Sepal.Width = Sepal.Width * 2) # grouped computation pl_iris |> group_by(Species) |> mutate(foo = mean(Sepal.Length)) # an alternative syntax for grouping is to use `.by` pl_iris |> mutate(foo = mean(Sepal.Length), .by = Species) # across() is available pl_iris |> mutate( across(.cols = contains("Sepal"), .fns = mean, .names = "{.fn}_of_{.col}") ) # # It can receive several types of functions: pl_iris |> mutate( across( .cols = contains("Sepal"), .fns = list(mean = mean, sd = ~ sd(.x)), .names = "{.fn}_of_{.col}" ) ) # Be careful when using across(.cols = where(...), ...) as it will not include # variables created in the same `...` (this is only the case for `where()`): ## Not run: pl_iris |> mutate( foo = 1, across( .cols = where(is.numeric), \(x) x - 1000 # <<<<<<<<< this will not be applied on variable "foo" ) ) ## End(Not run) # Warning message: # In `across()`, the argument `.cols = where(is.numeric)` will not take into account # variables created in the same `mutate()`/`summarize` call. # Embracing an external variable works some_value <- 1 mutate(pl_iris, x = {{ some_value }})
Pivot a Data/LazyFrame from wide to long
## S3 method for class 'polars_data_frame' pivot_longer( data, cols, ..., names_to = "name", names_prefix = NULL, values_to = "value" ) ## S3 method for class 'polars_lazy_frame' pivot_longer( data, cols, ..., names_to = "name", names_prefix = NULL, values_to = "value" )
## S3 method for class 'polars_data_frame' pivot_longer( data, cols, ..., names_to = "name", names_prefix = NULL, values_to = "value" ) ## S3 method for class 'polars_lazy_frame' pivot_longer( data, cols, ..., names_to = "name", names_prefix = NULL, values_to = "value" )
data |
A Polars Data/LazyFrame |
cols |
Columns to pivot into longer format. Can be anything accepted by
|
... |
Dots which should be empty. |
names_to |
The (quoted) name of the column that will contain the column
names specified by |
names_prefix |
A regular expression used to remove matching text from the start of each variable name. |
values_to |
A string specifying the name of the column to create from the data stored in cell values. |
Arguments that are supported by the original implementation in the tidyverse
but are not listed above will throw a warning by default if they are
specified. To change this behavior to error instead, use
options(tidypolars_unknown_args = "error")
.
pl_relig_income <- as_polars_df(tidyr::relig_income) pl_relig_income pl_relig_income |> pivot_longer(!religion, names_to = "income", values_to = "count") pl_billboard <- as_polars_df(tidyr::billboard) pl_billboard pl_billboard |> pivot_longer( cols = starts_with("wk"), names_to = "week", names_prefix = "wk", values_to = "rank", )
pl_relig_income <- as_polars_df(tidyr::relig_income) pl_relig_income pl_relig_income |> pivot_longer(!religion, names_to = "income", values_to = "count") pl_billboard <- as_polars_df(tidyr::billboard) pl_billboard pl_billboard |> pivot_longer( cols = starts_with("wk"), names_to = "week", names_prefix = "wk", values_to = "rank", )
Pivot a DataFrame from long to wide
## S3 method for class 'polars_data_frame' pivot_wider( data, ..., id_cols = NULL, names_from = name, values_from = value, names_prefix = "", names_sep = "_", names_glue = NULL, values_fill = NULL )
## S3 method for class 'polars_data_frame' pivot_wider( data, ..., id_cols = NULL, names_from = name, values_from = value, names_prefix = "", names_sep = "_", names_glue = NULL, values_fill = NULL )
data |
A Polars DataFrame (LazyFrames are not supported). |
... |
Dots which should be empty. |
id_cols |
A set of columns that uniquely identify each observation. Typically used when you have redundant variables, i.e. variables whose values are perfectly correlated with existing variables. Defaults to all columns in data except for the columns specified through
|
names_from |
The (quoted or unquoted) column names whose values will be used for the names of the new columns. |
values_from |
The (quoted or unquoted) column names whose values will be used to fill the new columns. |
names_prefix |
String added to the start of every variable name. This is
particularly useful if |
names_sep |
If |
names_glue |
Instead of |
values_fill |
A scalar that will be used to replace missing values in the new columns. Note that the type of this value will be applied to new columns. For example, if you provide a character value to fill numeric columns, then all these columns will be converted to character. |
Arguments that are supported by the original implementation in the tidyverse
but are not listed above will throw a warning by default if they are
specified. To change this behavior to error instead, use
options(tidypolars_unknown_args = "error")
.
pl_fish_encounters <- as_polars_df(tidyr::fish_encounters) pl_fish_encounters |> pivot_wider(names_from = station, values_from = seen) pl_fish_encounters |> pivot_wider(names_from = station, values_from = seen, values_fill = 0) # be careful about the type of the replacement value! pl_fish_encounters |> pivot_wider(names_from = station, values_from = seen, values_fill = "a") # using "names_glue" to specify the names of new columns production <- expand.grid( product = c("A", "B"), country = c("AI", "EI"), year = 2000:2014 ) |> filter((product == "A" & country == "AI") | product == "B") |> mutate(production = 1:45) |> as_polars_df() production production |> pivot_wider( names_from = c(product, country), values_from = production, names_glue = "prod_{product}_{country}" )
pl_fish_encounters <- as_polars_df(tidyr::fish_encounters) pl_fish_encounters |> pivot_wider(names_from = station, values_from = seen) pl_fish_encounters |> pivot_wider(names_from = station, values_from = seen, values_fill = 0) # be careful about the type of the replacement value! pl_fish_encounters |> pivot_wider(names_from = station, values_from = seen, values_fill = "a") # using "names_glue" to specify the names of new columns production <- expand.grid( product = c("A", "B"), country = c("AI", "EI"), year = 2000:2014 ) |> filter((product == "A" & country == "AI") | product == "B") |> mutate(production = 1:45) |> as_polars_df() production production |> pivot_wider( names_from = c(product, country), values_from = production, names_glue = "prod_{product}_{country}" )
This returns an R vector and not a Polars Series.
## S3 method for class 'polars_data_frame' pull(.data, var, ...) ## S3 method for class 'polars_lazy_frame' pull(.data, var, ...)
## S3 method for class 'polars_data_frame' pull(.data, var, ...) ## S3 method for class 'polars_lazy_frame' pull(.data, var, ...)
.data |
A Polars Data/LazyFrame |
var |
A quoted or unquoted variable name, or a variable index. |
... |
Dots which should be empty. |
pl_test <- as_polars_df(iris) pull(pl_test, Sepal.Length) pull(pl_test, "Sepal.Length")
pl_test <- as_polars_df(iris) pull(pl_test, Sepal.Length) pull(pl_test, "Sepal.Length")
Use relocate()
to change column positions, using the same syntax as
select()
to make it easy to move blocks of columns at once.
## S3 method for class 'polars_data_frame' relocate(.data, ..., .before = NULL, .after = NULL) ## S3 method for class 'polars_lazy_frame' relocate(.data, ..., .before = NULL, .after = NULL)
## S3 method for class 'polars_data_frame' relocate(.data, ..., .before = NULL, .after = NULL) ## S3 method for class 'polars_lazy_frame' relocate(.data, ..., .before = NULL, .after = NULL)
.data |
A Polars Data/LazyFrame |
... |
Any expression accepted by |
.before , .after
|
Column name (either quoted or unquoted) that
indicates the destination of columns selected by |
dat <- as_polars_df(mtcars) dat |> relocate(hp, vs, .before = cyl) # if .before and .after are not specified, selected columns are moved to the # first positions dat |> relocate(hp, vs) # .before and .after can be quoted or unquoted dat |> relocate(hp, vs, .after = "gear") # select helpers are also available dat |> relocate(contains("[aeiou]")) dat |> relocate(hp, vs, .after = last_col())
dat <- as_polars_df(mtcars) dat |> relocate(hp, vs, .before = cyl) # if .before and .after are not specified, selected columns are moved to the # first positions dat |> relocate(hp, vs) # .before and .after can be quoted or unquoted dat |> relocate(hp, vs, .after = "gear") # select helpers are also available dat |> relocate(contains("[aeiou]")) dat |> relocate(hp, vs, .after = last_col())
Rename columns
## S3 method for class 'polars_data_frame' rename(.data, ...) ## S3 method for class 'polars_lazy_frame' rename(.data, ...) ## S3 method for class 'polars_data_frame' rename_with(.data, .fn, .cols = tidyselect::everything(), ...) ## S3 method for class 'polars_lazy_frame' rename_with(.data, .fn, .cols = tidyselect::everything(), ...)
## S3 method for class 'polars_data_frame' rename(.data, ...) ## S3 method for class 'polars_lazy_frame' rename(.data, ...) ## S3 method for class 'polars_data_frame' rename_with(.data, .fn, .cols = tidyselect::everything(), ...) ## S3 method for class 'polars_lazy_frame' rename_with(.data, .fn, .cols = tidyselect::everything(), ...)
.data |
A Polars Data/LazyFrame |
... |
For For |
.fn |
Function to apply on column names |
.cols |
Columns on which to apply |
pl_test <- polars::as_polars_df(mtcars) rename(pl_test, miles_per_gallon = mpg, horsepower = "hp") rename(pl_test, `Miles per gallon` = "mpg", `Horse power` = "hp") rename_with(pl_test, toupper, .cols = contains("p")) pl_test_2 <- polars::as_polars_df(iris) rename_with(pl_test_2, function(x) tolower(gsub(".", "_", x, fixed = TRUE))) rename_with(pl_test_2, \(x) tolower(gsub(".", "_", x, fixed = TRUE)))
pl_test <- polars::as_polars_df(mtcars) rename(pl_test, miles_per_gallon = mpg, horsepower = "hp") rename(pl_test, `Miles per gallon` = "mpg", `Horse power` = "hp") rename_with(pl_test, toupper, .cols = contains("p")) pl_test_2 <- polars::as_polars_df(iris) rename_with(pl_test_2, function(x) tolower(gsub(".", "_", x, fixed = TRUE))) rename_with(pl_test_2, \(x) tolower(gsub(".", "_", x, fixed = TRUE)))
Replace NAs with specified values
## S3 method for class 'polars_data_frame' replace_na(data, replace, ...) ## S3 method for class 'polars_lazy_frame' replace_na(data, replace, ...)
## S3 method for class 'polars_data_frame' replace_na(data, replace, ...) ## S3 method for class 'polars_lazy_frame' replace_na(data, replace, ...)
data |
A Polars Data/LazyFrame |
replace |
Either a scalar that will be used to replace |
... |
Dots which should be empty. |
pl_test <- polars::pl$DataFrame(x = c(NA, 1), y = c(2, NA)) # replace all NA with 0 replace_na(pl_test, 0) # custom replacement per column replace_na(pl_test, list(x = 0, y = 999))
pl_test <- polars::pl$DataFrame(x = c(NA, 1), y = c(2, NA)) # replace all NA with 0 replace_na(pl_test, 0) # custom replacement per column replace_na(pl_test, list(x = 0, y = 999))
[EXPERIMENTAL]
rowwise()
allows you to compute on a Data/LazyFrame a row-at-a-time. This
is most useful when a vectorised function doesn't exist. rowwise()
produces
another type of grouped data, and therefore can be removed with ungroup()
.
## S3 method for class 'polars_data_frame' rowwise(data, ...) ## S3 method for class 'polars_lazy_frame' rowwise(data, ...)
## S3 method for class 'polars_data_frame' rowwise(data, ...) ## S3 method for class 'polars_lazy_frame' rowwise(data, ...)
data |
A Polars Data/LazyFrame |
... |
Any expression accepted by |
A Polars Data/LazyFrame.
df <- polars::pl$DataFrame(x = c(1, 3, 4), y = c(2, 1, 5), z = c(2, 3, 1)) # Compute the mean of x, y, z in each row df |> rowwise() |> mutate(m = mean(c(x, y, z))) # Compute the min and max of x and y in each row df |> rowwise() |> mutate(min = min(c(x, y)), max = max(c(x, y)))
df <- polars::pl$DataFrame(x = c(1, 3, 4), y = c(2, 1, 5), z = c(2, 3, 1)) # Compute the mean of x, y, z in each row df |> rowwise() |> mutate(m = mean(c(x, y, z))) # Compute the min and max of x and y in each row df |> rowwise() |> mutate(min = min(c(x, y)), max = max(c(x, y)))
Select columns from a Data/LazyFrame
## S3 method for class 'polars_data_frame' select(.data, ...) ## S3 method for class 'polars_lazy_frame' select(.data, ...)
## S3 method for class 'polars_data_frame' select(.data, ...) ## S3 method for class 'polars_lazy_frame' select(.data, ...)
.data |
A Polars Data/LazyFrame |
... |
Any expression accepted by |
pl_iris <- polars::as_polars_df(iris) select(pl_iris, c("Sepal.Length", "Sepal.Width")) select(pl_iris, Sepal.Length, Sepal.Width) select(pl_iris, 1:3) select(pl_iris, starts_with("Sepal")) select(pl_iris, -ends_with("Length")) # Renaming while selecting is also possible select(pl_iris, foo1 = Sepal.Length, Sepal.Width)
pl_iris <- polars::as_polars_df(iris) select(pl_iris, c("Sepal.Length", "Sepal.Width")) select(pl_iris, Sepal.Length, Sepal.Width) select(pl_iris, 1:3) select(pl_iris, starts_with("Sepal")) select(pl_iris, -ends_with("Length")) # Renaming while selecting is also possible select(pl_iris, foo1 = Sepal.Length, Sepal.Width)
Filtering joins filter rows from x
based on the presence or absence of
matches in y
:
semi_join()
return all rows from x
with a match in y
.
anti_join()
return all rows from x
without a match in y
.
## S3 method for class 'polars_data_frame' semi_join(x, y, by = NULL, ..., na_matches = "na") ## S3 method for class 'polars_data_frame' anti_join(x, y, by = NULL, ..., na_matches = "na") ## S3 method for class 'polars_lazy_frame' semi_join(x, y, by = NULL, ..., na_matches = "na") ## S3 method for class 'polars_lazy_frame' anti_join(x, y, by = NULL, ..., na_matches = "na")
## S3 method for class 'polars_data_frame' semi_join(x, y, by = NULL, ..., na_matches = "na") ## S3 method for class 'polars_data_frame' anti_join(x, y, by = NULL, ..., na_matches = "na") ## S3 method for class 'polars_lazy_frame' semi_join(x, y, by = NULL, ..., na_matches = "na") ## S3 method for class 'polars_lazy_frame' anti_join(x, y, by = NULL, ..., na_matches = "na")
x , y
|
Two Polars Data/LazyFrames |
by |
Variables to join by. If
Finally, |
... |
Dots which should be empty. |
na_matches |
Should two
Note that when joining Polars Data/LazyFrames, |
Arguments that are supported by the original implementation in the tidyverse
but are not listed above will throw a warning by default if they are
specified. To change this behavior to error instead, use
options(tidypolars_unknown_args = "error")
.
test <- polars::pl$DataFrame( x = c(1, 2, 3), y = c(1, 2, 3), z = c(1, 2, 3) ) test2 <- polars::pl$DataFrame( x = c(1, 2, 4), y = c(1, 2, 4), z2 = c(1, 2, 4) ) test test2 # only keep the rows of `test` that have matching keys in `test2` semi_join(test, test2, by = c("x", "y")) # only keep the rows of `test` that don't have matching keys in `test2` anti_join(test, test2, by = c("x", "y"))
test <- polars::pl$DataFrame( x = c(1, 2, 3), y = c(1, 2, 3), z = c(1, 2, 3) ) test2 <- polars::pl$DataFrame( x = c(1, 2, 4), y = c(1, 2, 4), z2 = c(1, 2, 4) ) test test2 # only keep the rows of `test` that have matching keys in `test2` semi_join(test, test2, by = c("x", "y")) # only keep the rows of `test` that don't have matching keys in `test2` anti_join(test, test2, by = c("x", "y"))
Currently, splitting a column on a regular expression or position is not possible.
## S3 method for class 'polars_data_frame' separate(data, col, into, sep = " ", remove = TRUE, ...) ## S3 method for class 'polars_lazy_frame' separate(data, col, into, sep = " ", remove = TRUE, ...)
## S3 method for class 'polars_data_frame' separate(data, col, into, sep = " ", remove = TRUE, ...) ## S3 method for class 'polars_lazy_frame' separate(data, col, into, sep = " ", remove = TRUE, ...)
data |
A Polars Data/LazyFrame |
col |
Column to split |
into |
Character vector containing the names of new variables to create.
Use |
sep |
String that is used to split the column. Regular expressions are not supported yet. |
remove |
If |
... |
Dots which should be empty. |
test <- polars::pl$DataFrame( x = c(NA, "x.y", "x.z", "y.z") ) separate(test, x, into = c("foo", "foo2"), sep = ".")
test <- polars::pl$DataFrame( x = c(NA, "x.y", "x.z", "y.z") ) separate(test, x, into = c("foo", "foo2"), sep = ".")
This function allows to stream a LazyFrame that is larger than RAM directly
to a .csv
file without collecting it in the R session, thus preventing
crashes because of too small memory.
sink_csv( .data, path, ..., include_bom = FALSE, include_header = TRUE, separator = ",", line_terminator = "\n", quote_char = "\"", batch_size = 1024, datetime_format = NULL, date_format = NULL, time_format = NULL, float_precision = NULL, null_value = "", quote_style = "necessary", maintain_order = TRUE, type_coercion = TRUE, predicate_pushdown = TRUE, projection_pushdown = TRUE, simplify_expression = TRUE, slice_pushdown = TRUE, no_optimization = FALSE, quote, null_values )
sink_csv( .data, path, ..., include_bom = FALSE, include_header = TRUE, separator = ",", line_terminator = "\n", quote_char = "\"", batch_size = 1024, datetime_format = NULL, date_format = NULL, time_format = NULL, float_precision = NULL, null_value = "", quote_style = "necessary", maintain_order = TRUE, type_coercion = TRUE, predicate_pushdown = TRUE, projection_pushdown = TRUE, simplify_expression = TRUE, slice_pushdown = TRUE, no_optimization = FALSE, quote, null_values )
.data |
A Polars LazyFrame. |
path |
Output file (must be a |
... |
Ignored. |
include_bom |
Whether to include UTF-8 BOM (byte order mark) in the CSV output. |
include_header |
Whether to include header in the CSV output. |
separator |
Separate CSV fields with this symbol. |
line_terminator |
String used to end each row. |
quote_char |
Byte to use as quoting character. |
batch_size |
Number of rows that will be processed per thread. |
datetime_format , date_format , time_format
|
A format string used to format
date and time values. See |
float_precision |
Number of decimal places to write, applied to both
|
null_value |
A string representing null values (defaulting to the empty string). |
quote_style |
Determines the quoting strategy used:
|
maintain_order |
Whether to maintain the order in which the data was processed
(default is |
type_coercion |
Coerce types such that operations succeed and run on
minimal required memory (default is |
predicate_pushdown |
Applies filters as early as possible at scan level
(default is |
projection_pushdown |
Select only the columns that are needed at the
scan level (default is |
simplify_expression |
Various optimizations, such as constant folding
and replacing expensive operations with faster alternatives (default is
|
slice_pushdown |
Only load the required slice from the scan. Don't
materialize sliced outputs (default
is |
no_optimization |
Sets the following optimizations to |
quote |
|
null_values |
The input LazyFrame.
## Not run: # This is an example workflow where sink_csv() is not very useful because # the data would fit in memory. It simply is an example of using it at the # end of a piped workflow. # Create files for the CSV input and output: file_csv <- tempfile(fileext = ".csv") file_csv2 <- tempfile(fileext = ".csv") # Write some data in a CSV file fake_data <- do.call("rbind", rep(list(mtcars), 1000)) write.csv(fake_data, file = file_csv, row.names = FALSE) # In a new R session, we could read this file as a LazyFrame, do some operations, # and write it to another CSV file without ever collecting it in the R session: scan_csv_polars(file_csv) |> filter(cyl %in% c(4, 6), mpg > 22) |> mutate( hp_gear_ratio = hp / gear ) |> sink_csv(path = file_csv2) ## End(Not run)
## Not run: # This is an example workflow where sink_csv() is not very useful because # the data would fit in memory. It simply is an example of using it at the # end of a piped workflow. # Create files for the CSV input and output: file_csv <- tempfile(fileext = ".csv") file_csv2 <- tempfile(fileext = ".csv") # Write some data in a CSV file fake_data <- do.call("rbind", rep(list(mtcars), 1000)) write.csv(fake_data, file = file_csv, row.names = FALSE) # In a new R session, we could read this file as a LazyFrame, do some operations, # and write it to another CSV file without ever collecting it in the R session: scan_csv_polars(file_csv) |> filter(cyl %in% c(4, 6), mpg > 22) |> mutate( hp_gear_ratio = hp / gear ) |> sink_csv(path = file_csv2) ## End(Not run)
This function allows streaming a LazyFrame that is larger than RAM directly to an IPC file without collecting it in the R session, thus preventing crashes due to insufficient memory.
sink_ipc( .data, path, ..., compression = "zstd", compat_level = "newest", maintain_order = TRUE, type_coercion = TRUE, predicate_pushdown = TRUE, projection_pushdown = TRUE, simplify_expression = TRUE, slice_pushdown = TRUE, no_optimization = FALSE )
sink_ipc( .data, path, ..., compression = "zstd", compat_level = "newest", maintain_order = TRUE, type_coercion = TRUE, predicate_pushdown = TRUE, projection_pushdown = TRUE, simplify_expression = TRUE, slice_pushdown = TRUE, no_optimization = FALSE )
.data |
A Polars LazyFrame. |
path |
Output file. |
... |
Ignored. |
compression |
|
compat_level |
Determines the compatibility level when exporting Polars'
internal data structures. When specifying a new compatibility level, Polars
exports its internal data structures that might not be interpretable by other
Arrow implementations. The level can be specified as the name (e.g.,
|
maintain_order |
Whether to maintain the order in which the data was processed
(default is |
type_coercion |
Coerce types such that operations succeed and run on
minimal required memory (default is |
predicate_pushdown |
Applies filters as early as possible at scan level
(default is |
projection_pushdown |
Select only the columns that are needed at the
scan level (default is |
simplify_expression |
Various optimizations, such as constant folding
and replacing expensive operations with faster alternatives (default is
|
slice_pushdown |
Only load the required slice from the scan. Don't
materialize sliced outputs (default
is |
no_optimization |
Sets the following optimizations to |
The input LazyFrame.
This writes the output of a query directly to a NDJSON file without collecting it in the R session first. This is useful if the output of the query is still larger than RAM as it would crash the R session if it was collected into R.
sink_ndjson( .data, path, ..., maintain_order = TRUE, type_coercion = TRUE, predicate_pushdown = TRUE, projection_pushdown = TRUE, simplify_expression = TRUE, slice_pushdown = TRUE, no_optimization = FALSE )
sink_ndjson( .data, path, ..., maintain_order = TRUE, type_coercion = TRUE, predicate_pushdown = TRUE, projection_pushdown = TRUE, simplify_expression = TRUE, slice_pushdown = TRUE, no_optimization = FALSE )
.data |
A Polars LazyFrame. |
path |
Output file. |
... |
Ignored. |
maintain_order |
Whether to maintain the order in which the data was processed
(default is |
type_coercion |
Coerce types such that operations succeed and run on
minimal required memory (default is |
predicate_pushdown |
Applies filters as early as possible at scan level
(default is |
projection_pushdown |
Select only the columns that are needed at the
scan level (default is |
simplify_expression |
Various optimizations, such as constant folding
and replacing expensive operations with faster alternatives (default is
|
slice_pushdown |
Only load the required slice from the scan. Don't
materialize sliced outputs (default
is |
no_optimization |
Sets the following optimizations to |
The input LazyFrame.
This function allows streaming a LazyFrame that is larger than RAM directly
to a .parquet
file without collecting it in the R session, thus preventing
crashes due to insufficient memory.
sink_parquet( .data, path, ..., compression = "zstd", compression_level = 3, statistics = FALSE, row_group_size = NULL, data_page_size = NULL, maintain_order = TRUE, type_coercion = TRUE, predicate_pushdown = TRUE, projection_pushdown = TRUE, simplify_expression = TRUE, slice_pushdown = TRUE, no_optimization = FALSE )
sink_parquet( .data, path, ..., compression = "zstd", compression_level = 3, statistics = FALSE, row_group_size = NULL, data_page_size = NULL, maintain_order = TRUE, type_coercion = TRUE, predicate_pushdown = TRUE, projection_pushdown = TRUE, simplify_expression = TRUE, slice_pushdown = TRUE, no_optimization = FALSE )
.data |
A Polars LazyFrame. |
path |
Output file (must be a |
... |
Ignored. |
compression |
The compression method. One of:
|
compression_level |
The level of compression to use (default is 3). Only
used if
|
statistics |
Whether to compute and write column statistics (default is
|
row_group_size |
Size of the row groups in number of rows. If |
data_page_size |
If |
maintain_order |
Whether to maintain the order in which the data was processed
(default is |
type_coercion |
Coerce types such that operations succeed and run on
minimal required memory (default is |
predicate_pushdown |
Applies filters as early as possible at scan level
(default is |
projection_pushdown |
Select only the columns that are needed at the
scan level (default is |
simplify_expression |
Various optimizations, such as constant folding
and replacing expensive operations with faster alternatives (default is
|
slice_pushdown |
Only load the required slice from the scan. Don't
materialize sliced outputs (default
is |
no_optimization |
Sets the following optimizations to |
The input LazyFrame.
## Not run: # This is an example workflow where sink_parquet() is not very useful because # the data would fit in memory. It simply is an example of using it at the # end of a piped workflow. # Create files for the CSV input and the Parquet output: file_csv <- tempfile(fileext = ".csv") file_parquet <- tempfile(fileext = ".parquet") # Write some data in a CSV file fake_data <- do.call("rbind", rep(list(mtcars), 1000)) write.csv(fake_data, file = file_csv, row.names = FALSE) # In a new R session, we could read this file as a LazyFrame, do some operations, # and write it to a parquet file without ever collecting it in the R session: scan_csv_polars(file_csv) |> filter(cyl %in% c(4, 6), mpg > 22) |> mutate( hp_gear_ratio = hp / gear ) |> sink_parquet(path = file_parquet) ## End(Not run)
## Not run: # This is an example workflow where sink_parquet() is not very useful because # the data would fit in memory. It simply is an example of using it at the # end of a piped workflow. # Create files for the CSV input and the Parquet output: file_csv <- tempfile(fileext = ".csv") file_parquet <- tempfile(fileext = ".parquet") # Write some data in a CSV file fake_data <- do.call("rbind", rep(list(mtcars), 1000)) write.csv(fake_data, file = file_csv, row.names = FALSE) # In a new R session, we could read this file as a LazyFrame, do some operations, # and write it to a parquet file without ever collecting it in the R session: scan_csv_polars(file_csv) |> filter(cyl %in% c(4, 6), mpg > 22) |> mutate( hp_gear_ratio = hp / gear ) |> sink_parquet(path = file_parquet) ## End(Not run)
Subset rows of a Data/LazyFrame
## S3 method for class 'polars_data_frame' slice_tail(.data, ..., n, by = NULL) ## S3 method for class 'polars_lazy_frame' slice_tail(.data, ..., n, by = NULL) ## S3 method for class 'polars_data_frame' slice_head(.data, ..., n, by = NULL) ## S3 method for class 'polars_lazy_frame' slice_head(.data, ..., n, by = NULL) ## S3 method for class 'polars_data_frame' slice_sample(.data, ..., n = NULL, prop = NULL, by = NULL, replace = FALSE)
## S3 method for class 'polars_data_frame' slice_tail(.data, ..., n, by = NULL) ## S3 method for class 'polars_lazy_frame' slice_tail(.data, ..., n, by = NULL) ## S3 method for class 'polars_data_frame' slice_head(.data, ..., n, by = NULL) ## S3 method for class 'polars_lazy_frame' slice_head(.data, ..., n, by = NULL) ## S3 method for class 'polars_data_frame' slice_sample(.data, ..., n = NULL, prop = NULL, by = NULL, replace = FALSE)
.data |
A Polars Data/LazyFrame |
... |
Dots which should be empty. |
n |
The number of rows to select from the start or the end of the data.
Cannot be used with |
by |
Optionally, a selection of columns to group by for just this
operation, functioning as an alternative to |
prop |
Proportion of rows to select. Cannot be used with |
replace |
Perform the sampling with replacement ( |
Arguments that are supported by the original implementation in the tidyverse
but are not listed above will throw a warning by default if they are
specified. To change this behavior to error instead, use
options(tidypolars_unknown_args = "error")
.
pl_test <- polars::as_polars_df(iris) slice_head(pl_test, n = 3) slice_tail(pl_test, n = 3) slice_sample(pl_test, n = 5) slice_sample(pl_test, prop = 0.1)
pl_test <- polars::as_polars_df(iris) slice_head(pl_test, n = 3) slice_tail(pl_test, n = 3) slice_sample(pl_test, n = 5) slice_sample(pl_test, prop = 0.1)
summarize()
returns one row for each combination of grouping variables
(one difference with dplyr::summarize()
is that summarize()
only
accepts grouped data). It will contain one column for each grouping variable
and one column for each of the summary statistics that you have specified.
## S3 method for class 'polars_data_frame' summarize(.data, ..., .by = NULL, .groups = "drop_last") ## S3 method for class 'polars_data_frame' summarise(.data, ..., .by = NULL, .groups = "drop_last") ## S3 method for class 'polars_lazy_frame' summarize(.data, ..., .by = NULL, .groups = "drop_last") ## S3 method for class 'polars_lazy_frame' summarise(.data, ..., .by = NULL, .groups = "drop_last")
## S3 method for class 'polars_data_frame' summarize(.data, ..., .by = NULL, .groups = "drop_last") ## S3 method for class 'polars_data_frame' summarise(.data, ..., .by = NULL, .groups = "drop_last") ## S3 method for class 'polars_lazy_frame' summarize(.data, ..., .by = NULL, .groups = "drop_last") ## S3 method for class 'polars_lazy_frame' summarise(.data, ..., .by = NULL, .groups = "drop_last")
.data |
A Polars Data/LazyFrame |
... |
Name-value pairs. The name gives the name of the column in the output. The value can be:
|
.by |
Optionally, a selection of columns to group by for just this
operation, functioning as an alternative to |
.groups |
Grouping structure of the result. Must be one of:
For now, |
mtcars |> as_polars_df() |> group_by(cyl) |> summarize(m_gear = mean(gear), sd_gear = sd(gear)) # an alternative syntax is to use `.by` mtcars |> as_polars_df() |> summarize(m_gear = mean(gear), sd_gear = sd(gear), .by = cyl)
mtcars |> as_polars_df() |> group_by(cyl) |> summarize(m_gear = mean(gear), sd_gear = sd(gear)) # an alternative syntax is to use `.by` mtcars |> as_polars_df() |> summarize(m_gear = mean(gear), sd_gear = sd(gear), .by = cyl)
Summary statistics for a Polars DataFrame
## S3 method for class 'polars_data_frame' summary(object, percentiles = c(0.25, 0.5, 0.75), ...)
## S3 method for class 'polars_data_frame' summary(object, percentiles = c(0.25, 0.5, 0.75), ...)
object |
A Polars DataFrame. |
percentiles |
One or more percentiles to include in the summary
statistics. All values must be between 0 and 1 ( |
... |
Ignored. |
mtcars |> as_polars_df() |> summary(percentiles = c(0.2, 0.4, 0.6, 0.8))
mtcars |> as_polars_df() |> summary(percentiles = c(0.2, 0.4, 0.6, 0.8))
tidypolars
global optionstidypolars
has the following global options:
tidypolars_unknown_args
controls what happens when some arguments passed
in an expression are unknown, e.g the argument prob
in sample()
. The
default ("warn"
) only warns the user that some arguments are ignored by
tidypolars
. The only other accepted value is "error"
to throw an
error when this happens.
tidypolars_fallback_to_r
controls what happens when an unknown function
(that isn't translated to use polars syntax) is passed in an expression.
The default is FALSE
, meaning that unknown functions will trigger an
error. Setting this option to TRUE
will convert the data to R, apply the
unknown function, and convert the output back to polars.
Using the fallback to R has several drawbacks:
it loses some of polars built-in parallelism and other optimizations;
the session may crash or experience a severe slowdown when the data is converted to R (especially if the input is a LazyFrame).
The package polars
also contains several global options that may be useful,
such as changing the default behavior when converting Int64 values to R:
https://pola-rs.github.io/r-polars/man/polars_options.html.
##### Unknown arguments options(tidypolars_unknown_args = "warn") test <- polars::pl$DataFrame(x = c(2, 1, 5, 3, 1)) # The default is to warn the user mutate(test, x2 = sample(x, prob = 0.5)) # But one can make this stricter and throw an error when this happens options(tidypolars_unknown_args = "error") try(mutate(test, x2 = sample(x, prob = 0.5))) options(tidypolars_unknown_args = "warn") ##### Fallback to R test <- polars::pl$DataFrame(x = c(2, 1, 5, 3, 1)) # The default is to error because mad() isn't translated internally try(mutate(test, x2 = mad(x))) # But one can allow fallback to R to apply this function and then convert # the output back to polars (see drawbacks in the "description" section # above) options(tidypolars_fallback_to_r = TRUE) mutate(test, x2 = mad(x)) options(tidypolars_fallback_to_r = FALSE)
##### Unknown arguments options(tidypolars_unknown_args = "warn") test <- polars::pl$DataFrame(x = c(2, 1, 5, 3, 1)) # The default is to warn the user mutate(test, x2 = sample(x, prob = 0.5)) # But one can make this stricter and throw an error when this happens options(tidypolars_unknown_args = "error") try(mutate(test, x2 = sample(x, prob = 0.5))) options(tidypolars_unknown_args = "warn") ##### Fallback to R test <- polars::pl$DataFrame(x = c(2, 1, 5, 3, 1)) # The default is to error because mad() isn't translated internally try(mutate(test, x2 = mad(x))) # But one can allow fallback to R to apply this function and then convert # the output back to polars (see drawbacks in the "description" section # above) options(tidypolars_fallback_to_r = TRUE) mutate(test, x2 = mad(x)) options(tidypolars_fallback_to_r = FALSE)
This duplicates rows according to a weighting variable (or expression). This
is the opposite of count()
.
## S3 method for class 'polars_data_frame' uncount(data, weights, ..., .remove = TRUE, .id = NULL) ## S3 method for class 'polars_lazy_frame' uncount(data, weights, ..., .remove = TRUE, .id = NULL)
## S3 method for class 'polars_data_frame' uncount(data, weights, ..., .remove = TRUE, .id = NULL) ## S3 method for class 'polars_lazy_frame' uncount(data, weights, ..., .remove = TRUE, .id = NULL)
data |
A Polars Data/LazyFrame |
weights |
A vector of weights. Evaluated in the context of |
... |
Dots which should be empty. |
.remove |
If |
.id |
Supply a string to create a new variable which gives a unique identifier for each created row. |
test <- polars::pl$DataFrame(x = c("a", "b"), y = 100:101, n = c(1, 2)) test uncount(test, n) uncount(test, n, .id = "id") # using constants uncount(test, 2) # using expressions uncount(test, 2 / n)
test <- polars::pl$DataFrame(x = c("a", "b"), y = 100:101, n = c(1, 2)) test uncount(test, n) uncount(test, n, .id = "id") # using constants uncount(test, 2) # using expressions uncount(test, 2 / n)
Unite multiple columns into one by pasting strings together
## S3 method for class 'polars_data_frame' unite(data, col, ..., sep = "_", remove = TRUE, na.rm = FALSE) ## S3 method for class 'polars_lazy_frame' unite(data, col, ..., sep = "_", remove = TRUE, na.rm = FALSE)
## S3 method for class 'polars_data_frame' unite(data, col, ..., sep = "_", remove = TRUE, na.rm = FALSE) ## S3 method for class 'polars_lazy_frame' unite(data, col, ..., sep = "_", remove = TRUE, na.rm = FALSE)
data |
A Polars Data/LazyFrame |
col |
The name of the new column, as a string or symbol. |
... |
Any expression accepted by |
sep |
Separator to use between values. |
remove |
If |
na.rm |
If |
test <- polars::pl$DataFrame( year = 2009:2011, month = 10:12, day = c(11L, 22L, 28L), name_day = c("Monday", "Thursday", "Wednesday") ) # By default, united columns are dropped unite(test, col = "full_date", year, month, day, sep = "-") unite(test, col = "full_date", year, month, day, sep = "-", remove = FALSE) test2 <- polars::pl$DataFrame( name = c("John", "Jack", "Thomas"), middlename = c("T.", NA, "F."), surname = c("Smith", "Thompson", "Jones") ) # By default, NA values are kept in the character output unite(test2, col = "full_name", everything(), sep = " ") unite(test2, col = "full_name", everything(), sep = " ", na.rm = TRUE)
test <- polars::pl$DataFrame( year = 2009:2011, month = 10:12, day = c(11L, 22L, 28L), name_day = c("Monday", "Thursday", "Wednesday") ) # By default, united columns are dropped unite(test, col = "full_date", year, month, day, sep = "-") unite(test, col = "full_date", year, month, day, sep = "-", remove = FALSE) test2 <- polars::pl$DataFrame( name = c("John", "Jack", "Thomas"), middlename = c("T.", NA, "F."), surname = c("Smith", "Thompson", "Jones") ) # By default, NA values are kept in the character output unite(test2, col = "full_name", everything(), sep = " ") unite(test2, col = "full_name", everything(), sep = " ", na.rm = TRUE)
Export data to CSV file(s)
write_csv_polars( .data, file, ..., include_bom = FALSE, include_header = TRUE, separator = ",", line_terminator = "\n", quote_char = "\"", batch_size = 1024, datetime_format = NULL, date_format = NULL, time_format = NULL, float_precision = NULL, null_value = "", quote_style = "necessary", quote, null_values )
write_csv_polars( .data, file, ..., include_bom = FALSE, include_header = TRUE, separator = ",", line_terminator = "\n", quote_char = "\"", batch_size = 1024, datetime_format = NULL, date_format = NULL, time_format = NULL, float_precision = NULL, null_value = "", quote_style = "necessary", quote, null_values )
.data |
A Polars DataFrame. |
file |
File path to which the result should be written. |
... |
Ignored. |
include_bom |
Whether to include UTF-8 BOM (byte order mark) in the CSV output. |
include_header |
Whether to include header in the CSV output. |
separator |
Separate CSV fields with this symbol. |
line_terminator |
String used to end each row. |
quote_char |
Byte to use as quoting character. |
batch_size |
Number of rows that will be processed per thread. |
datetime_format |
A format string, with the specifiers defined by the chrono Rust crate. If no format is specified, the default fractional-second precision is inferred from the maximum time unit found in the frame's Datetime columns (if any). |
date_format |
A format string, with the specifiers defined by the chrono Rust crate. |
time_format |
A format string, with the specifiers defined by the chrono Rust crate. |
float_precision |
Number of decimal places to write, applied to both Float32 and Float64 datatypes. |
null_value |
A string representing null values (defaulting to the empty string). |
quote_style |
Determines the quoting strategy used.
|
quote |
|
null_values |
The input DataFrame.
dest <- tempfile(fileext = ".csv") mtcars |> as_polars_df() |> write_csv_polars(dest) read.csv(dest)
dest <- tempfile(fileext = ".csv") mtcars |> as_polars_df() |> write_csv_polars(dest) read.csv(dest)
Export data to IPC file(s)
write_ipc_polars( .data, file, compression = "uncompressed", ..., compat_level = "newest", future )
write_ipc_polars( .data, file, compression = "uncompressed", ..., compat_level = "newest", future )
.data |
A Polars DataFrame. |
file |
File path to which the result should be written. |
compression |
|
... |
Ignored. |
compat_level |
Determines the compatibility level when exporting Polars'
internal data structures. When specifying a new compatibility level, Polars
exports its internal data structures that might not be interpretable by other
Arrow implementations. The level can be specified as the name (e.g.,
|
future |
The input DataFrame.
Export data to JSON file(s)
write_json_polars(.data, file, ..., pretty = FALSE, row_oriented = FALSE)
write_json_polars(.data, file, ..., pretty = FALSE, row_oriented = FALSE)
.data |
A Polars DataFrame. |
file |
File path to which the result should be written. |
... |
Ignored. |
pretty |
|
row_oriented |
The input DataFrame.
dest <- tempfile(fileext = ".json") mtcars |> as_polars_df() |> write_json_polars(dest) jsonlite::fromJSON(dest)
dest <- tempfile(fileext = ".json") mtcars |> as_polars_df() |> write_json_polars(dest) jsonlite::fromJSON(dest)
Export data to NDJSON file(s)
write_ndjson_polars(.data, file)
write_ndjson_polars(.data, file)
.data |
A Polars DataFrame. |
file |
File path to which the result should be written. |
The input DataFrame.
dest <- tempfile(fileext = ".ndjson") mtcars |> as_polars_df() |> write_ndjson_polars(dest) jsonlite::stream_in(file(dest), verbose = FALSE)
dest <- tempfile(fileext = ".ndjson") mtcars |> as_polars_df() |> write_ndjson_polars(dest) jsonlite::stream_in(file(dest), verbose = FALSE)
Export data to Parquet file(s)
write_parquet_polars( .data, file, ..., compression = "zstd", compression_level = 3, statistics = TRUE, row_group_size = NULL, data_page_size = NULL, partition_by = NULL, partition_chunk_size_bytes = 4294967296 )
write_parquet_polars( .data, file, ..., compression = "zstd", compression_level = 3, statistics = TRUE, row_group_size = NULL, data_page_size = NULL, partition_by = NULL, partition_chunk_size_bytes = 4294967296 )
.data |
A Polars DataFrame. |
file |
File path to which the result should be written. |
... |
Ignored. |
compression |
The compression method. One of:
|
compression_level |
The level of compression to use (default is 3). Only
used if
|
statistics |
Whether to compute and write column statistics (default is
|
row_group_size |
Size of the row groups in number of rows. If |
data_page_size |
If |
partition_by |
Column(s) to partition by. A partitioned dataset will be written if this is specified. |
partition_chunk_size_bytes |
Approximate size to split DataFrames within a single partition when writing. Note this is calculated using the size of the DataFrame in memory - the size of the output file may differ depending on the file format / compression. |
The input DataFrame.
dest <- tempfile(fileext = ".parquet") mtcars |> as_polars_df() |> write_parquet_polars(dest) nanoparquet::read_parquet(dest)
dest <- tempfile(fileext = ".parquet") mtcars |> as_polars_df() |> write_parquet_polars(dest) nanoparquet::read_parquet(dest)