This is an automated email from the ASF dual-hosted git repository.
timsaucer pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-python.git
The following commit(s) were added to refs/heads/main by this push:
new 56b7243 chore: deprecate `select_columns` (#911)
56b7243 is described below
commit 56b72438004965f36dd4ce7e14d62a533ab2026f
Author: Ion Koutsouris <[email protected]>
AuthorDate: Tue Oct 22 13:23:36 2024 +0200
chore: deprecate `select_columns` (#911)
* chore: deprecate select_columns
* chore: lint
* Update user document to use select instead of select_columns
* Update all tpch examples to use select instead of select_columns
---------
Co-authored-by: Tim Saucer <[email protected]>
---
.../user-guide/common-operations/select-and-filter.rst | 4 ++--
examples/import.py | 10 +++++-----
examples/tpch/convert_data_to_parquet.py | 2 +-
examples/tpch/q02_minimum_cost_supplier.py | 12 ++++++------
examples/tpch/q03_shipping_priority.py | 8 ++++----
examples/tpch/q04_order_priority_checking.py | 6 +++---
examples/tpch/q05_local_supplier_volume.py | 12 ++++++------
examples/tpch/q06_forecasting_revenue_change.py | 2 +-
examples/tpch/q07_volume_shipping.py | 10 +++++-----
examples/tpch/q08_market_share.py | 18 ++++++++----------
examples/tpch/q09_product_type_profit_measure.py | 14 ++++++--------
examples/tpch/q10_returned_item_reporting.py | 10 +++++-----
examples/tpch/q11_important_stock_identification.py | 8 ++++----
examples/tpch/q12_ship_mode_order_priority.py | 4 ++--
examples/tpch/q13_customer_distribution.py | 6 ++----
examples/tpch/q14_promotion_effect.py | 6 ++----
examples/tpch/q15_top_supplier.py | 6 +++---
examples/tpch/q16_part_supplier_relationship.py | 8 ++++----
examples/tpch/q17_small_quantity_order.py | 4 ++--
examples/tpch/q18_large_volume_customer.py | 8 ++++----
examples/tpch/q19_discounted_revenue.py | 4 ++--
examples/tpch/q20_potential_part_promotion.py | 14 ++++++--------
examples/tpch/q21_suppliers_kept_orders_waiting.py | 8 ++++----
examples/tpch/q22_global_sales_opportunity.py | 6 ++----
python/datafusion/dataframe.py | 3 +++
python/tests/test_dataframe.py | 14 ++++++--------
26 files changed, 98 insertions(+), 109 deletions(-)
diff --git a/docs/source/user-guide/common-operations/select-and-filter.rst b/docs/source/user-guide/common-operations/select-and-filter.rst
index 92b4841..0759091 100644
--- a/docs/source/user-guide/common-operations/select-and-filter.rst
+++ b/docs/source/user-guide/common-operations/select-and-filter.rst
@@ -33,7 +33,7 @@ DataFusion can work with several file types, to start simple we can use a subset
ctx = SessionContext()
df = ctx.read_parquet("yellow_trip_data.parquet")
- df.select_columns("trip_distance", "passenger_count")
+ df.select("trip_distance", "passenger_count")
For mathematical or logical operations use :py:func:`~datafusion.col` to
select columns, and give meaningful names to the resulting
operations using :py:func:`~datafusion.expr.Expr.alias`
@@ -48,7 +48,7 @@ operations using :py:func:`~datafusion.expr.Expr.alias`
Please be aware that all identifiers are effectively made lower-case in SQL, so if your file has capital letters
(ex: Name) you must put your column name in double quotes or the selection won’t work. As an alternative for simple
- column selection use :py:func:`~datafusion.dataframe.DataFrame.select_columns` without double quotes
+ column selection use :py:func:`~datafusion.dataframe.DataFrame.select` without double quotes
For selecting columns with capital letters use ``'"VendorID"'``
diff --git a/examples/import.py b/examples/import.py
index cd965cb..c9d2e8c 100644
--- a/examples/import.py
+++ b/examples/import.py
@@ -28,7 +28,7 @@ ctx = datafusion.SessionContext()
# The dictionary keys represent column names and the dictionary values
# represent column values
df = ctx.from_pydict({"a": [1, 2, 3], "b": [4, 5, 6]})
-assert type(df) == datafusion.DataFrame
+assert type(df) is datafusion.DataFrame
# Dataframe:
# +---+---+
# | a | b |
@@ -40,19 +40,19 @@ assert type(df) == datafusion.DataFrame
# Create a datafusion DataFrame from a Python list of rows
df = ctx.from_pylist([{"a": 1, "b": 4}, {"a": 2, "b": 5}, {"a": 3, "b": 6}])
-assert type(df) == datafusion.DataFrame
+assert type(df) is datafusion.DataFrame
# Convert pandas DataFrame to datafusion DataFrame
pandas_df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
df = ctx.from_pandas(pandas_df)
-assert type(df) == datafusion.DataFrame
+assert type(df) is datafusion.DataFrame
# Convert polars DataFrame to datafusion DataFrame
polars_df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
df = ctx.from_polars(polars_df)
-assert type(df) == datafusion.DataFrame
+assert type(df) is datafusion.DataFrame
# Convert Arrow Table to datafusion DataFrame
arrow_table = pa.Table.from_pydict({"a": [1, 2, 3], "b": [4, 5, 6]})
df = ctx.from_arrow(arrow_table)
-assert type(df) == datafusion.DataFrame
+assert type(df) is datafusion.DataFrame
diff --git a/examples/tpch/convert_data_to_parquet.py
b/examples/tpch/convert_data_to_parquet.py
index a8091a7..cb0b2f0 100644
--- a/examples/tpch/convert_data_to_parquet.py
+++ b/examples/tpch/convert_data_to_parquet.py
@@ -138,6 +138,6 @@ for filename, curr_schema in all_schemas.items():
df = ctx.read_csv(source_file, schema=schema, has_header=False,
delimiter="|")
- df = df.select_columns(*output_cols)
+ df = df.select(*output_cols)
df.write_parquet(dest_file, compression="snappy")
diff --git a/examples/tpch/q02_minimum_cost_supplier.py
b/examples/tpch/q02_minimum_cost_supplier.py
index 2171a20..2440fda 100644
--- a/examples/tpch/q02_minimum_cost_supplier.py
+++ b/examples/tpch/q02_minimum_cost_supplier.py
@@ -43,10 +43,10 @@ REGION_OF_INTEREST = "EUROPE"
ctx = SessionContext()
-df_part = ctx.read_parquet(get_data_path("part.parquet")).select_columns(
+df_part = ctx.read_parquet(get_data_path("part.parquet")).select(
"p_partkey", "p_mfgr", "p_type", "p_size"
)
-df_supplier = ctx.read_parquet(get_data_path("supplier.parquet")).select_columns(
+df_supplier = ctx.read_parquet(get_data_path("supplier.parquet")).select(
"s_acctbal",
"s_name",
"s_address",
@@ -55,13 +55,13 @@ df_supplier =
ctx.read_parquet(get_data_path("supplier.parquet")).select_columns
"s_nationkey",
"s_suppkey",
)
-df_partsupp =
ctx.read_parquet(get_data_path("partsupp.parquet")).select_columns(
+df_partsupp = ctx.read_parquet(get_data_path("partsupp.parquet")).select(
"ps_partkey", "ps_suppkey", "ps_supplycost"
)
-df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select_columns(
+df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select(
"n_nationkey", "n_regionkey", "n_name"
)
-df_region = ctx.read_parquet(get_data_path("region.parquet")).select_columns(
+df_region = ctx.read_parquet(get_data_path("region.parquet")).select(
"r_regionkey", "r_name"
)
@@ -115,7 +115,7 @@ df = df.join(df_part, (["ps_partkey"], ["p_partkey"]),
how="inner")
# From the problem statement, these are the values we wish to output
-df = df.select_columns(
+df = df.select(
"s_acctbal",
"s_name",
"n_name",
diff --git a/examples/tpch/q03_shipping_priority.py
b/examples/tpch/q03_shipping_priority.py
index 6a4886d..c4e8f46 100644
--- a/examples/tpch/q03_shipping_priority.py
+++ b/examples/tpch/q03_shipping_priority.py
@@ -37,13 +37,13 @@ DATE_OF_INTEREST = "1995-03-15"
ctx = SessionContext()
-df_customer =
ctx.read_parquet(get_data_path("customer.parquet")).select_columns(
+df_customer = ctx.read_parquet(get_data_path("customer.parquet")).select(
"c_mktsegment", "c_custkey"
)
-df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select_columns(
+df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select(
"o_orderdate", "o_shippriority", "o_custkey", "o_orderkey"
)
-df_lineitem =
ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns(
+df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select(
"l_orderkey", "l_extendedprice", "l_discount", "l_shipdate"
)
@@ -80,7 +80,7 @@ df = df.limit(10)
# Change the order that the columns are reported in just to match the spec
-df = df.select_columns("l_orderkey", "revenue", "o_orderdate",
"o_shippriority")
+df = df.select("l_orderkey", "revenue", "o_orderdate", "o_shippriority")
# Show result
diff --git a/examples/tpch/q04_order_priority_checking.py
b/examples/tpch/q04_order_priority_checking.py
index 77c3bd4..f10b74d 100644
--- a/examples/tpch/q04_order_priority_checking.py
+++ b/examples/tpch/q04_order_priority_checking.py
@@ -39,10 +39,10 @@ DATE_OF_INTEREST = "1993-07-01"
ctx = SessionContext()
-df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select_columns(
+df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select(
"o_orderdate", "o_orderpriority", "o_orderkey"
)
-df_lineitem =
ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns(
+df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select(
"l_orderkey", "l_commitdate", "l_receiptdate"
)
@@ -54,7 +54,7 @@ interval = pa.scalar((0, INTERVAL_DAYS, 0),
type=pa.month_day_nano_interval())
# Limit results to cases where commitment date before receipt date
# Aggregate the results so we only get one row to join with the order table.
# Alternately, and likely more idiomatic is instead of `.aggregate` you could
-# do `.select_columns("l_orderkey").distinct()`. The goal here is to show
+# do `.select("l_orderkey").distinct()`. The goal here is to show
# multiple examples of how to use Data Fusion.
df_lineitem = df_lineitem.filter(col("l_commitdate") <
col("l_receiptdate")).aggregate(
[col("l_orderkey")], []
diff --git a/examples/tpch/q05_local_supplier_volume.py
b/examples/tpch/q05_local_supplier_volume.py
index f17f600..2a83d2d 100644
--- a/examples/tpch/q05_local_supplier_volume.py
+++ b/examples/tpch/q05_local_supplier_volume.py
@@ -47,22 +47,22 @@ interval = pa.scalar((0, INTERVAL_DAYS, 0),
type=pa.month_day_nano_interval())
ctx = SessionContext()
-df_customer =
ctx.read_parquet(get_data_path("customer.parquet")).select_columns(
+df_customer = ctx.read_parquet(get_data_path("customer.parquet")).select(
"c_custkey", "c_nationkey"
)
-df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select_columns(
+df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select(
"o_custkey", "o_orderkey", "o_orderdate"
)
-df_lineitem =
ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns(
+df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select(
"l_orderkey", "l_suppkey", "l_extendedprice", "l_discount"
)
-df_supplier =
ctx.read_parquet(get_data_path("supplier.parquet")).select_columns(
+df_supplier = ctx.read_parquet(get_data_path("supplier.parquet")).select(
"s_suppkey", "s_nationkey"
)
-df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select_columns(
+df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select(
"n_nationkey", "n_regionkey", "n_name"
)
-df_region = ctx.read_parquet(get_data_path("region.parquet")).select_columns(
+df_region = ctx.read_parquet(get_data_path("region.parquet")).select(
"r_regionkey", "r_name"
)
diff --git a/examples/tpch/q06_forecasting_revenue_change.py
b/examples/tpch/q06_forecasting_revenue_change.py
index 3beb9eb..eaf9b0c 100644
--- a/examples/tpch/q06_forecasting_revenue_change.py
+++ b/examples/tpch/q06_forecasting_revenue_change.py
@@ -51,7 +51,7 @@ interval = pa.scalar((0, INTERVAL_DAYS, 0),
type=pa.month_day_nano_interval())
ctx = SessionContext()
-df_lineitem =
ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns(
+df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select(
"l_shipdate", "l_quantity", "l_extendedprice", "l_discount"
)
diff --git a/examples/tpch/q07_volume_shipping.py
b/examples/tpch/q07_volume_shipping.py
index 44c605a..a1d7d81 100644
--- a/examples/tpch/q07_volume_shipping.py
+++ b/examples/tpch/q07_volume_shipping.py
@@ -49,19 +49,19 @@ end_date = lit(datetime.strptime(END_DATE,
"%Y-%m-%d").date())
ctx = SessionContext()
-df_supplier =
ctx.read_parquet(get_data_path("supplier.parquet")).select_columns(
+df_supplier = ctx.read_parquet(get_data_path("supplier.parquet")).select(
"s_suppkey", "s_nationkey"
)
-df_lineitem =
ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns(
+df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select(
"l_shipdate", "l_extendedprice", "l_discount", "l_suppkey", "l_orderkey"
)
-df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select_columns(
+df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select(
"o_orderkey", "o_custkey"
)
-df_customer =
ctx.read_parquet(get_data_path("customer.parquet")).select_columns(
+df_customer = ctx.read_parquet(get_data_path("customer.parquet")).select(
"c_custkey", "c_nationkey"
)
-df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select_columns(
+df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select(
"n_nationkey", "n_name"
)
diff --git a/examples/tpch/q08_market_share.py
b/examples/tpch/q08_market_share.py
index cd6bc1f..95fc0a8 100644
--- a/examples/tpch/q08_market_share.py
+++ b/examples/tpch/q08_market_share.py
@@ -47,25 +47,23 @@ end_date = lit(datetime.strptime(END_DATE,
"%Y-%m-%d").date())
ctx = SessionContext()
-df_part = ctx.read_parquet(get_data_path("part.parquet")).select_columns(
- "p_partkey", "p_type"
-)
-df_supplier =
ctx.read_parquet(get_data_path("supplier.parquet")).select_columns(
+df_part = ctx.read_parquet(get_data_path("part.parquet")).select("p_partkey",
"p_type")
+df_supplier = ctx.read_parquet(get_data_path("supplier.parquet")).select(
"s_suppkey", "s_nationkey"
)
-df_lineitem =
ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns(
+df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select(
"l_partkey", "l_extendedprice", "l_discount", "l_suppkey", "l_orderkey"
)
-df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select_columns(
+df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select(
"o_orderkey", "o_custkey", "o_orderdate"
)
-df_customer =
ctx.read_parquet(get_data_path("customer.parquet")).select_columns(
+df_customer = ctx.read_parquet(get_data_path("customer.parquet")).select(
"c_custkey", "c_nationkey"
)
-df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select_columns(
+df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select(
"n_nationkey", "n_name", "n_regionkey"
)
-df_region = ctx.read_parquet(get_data_path("region.parquet")).select_columns(
+df_region = ctx.read_parquet(get_data_path("region.parquet")).select(
"r_regionkey", "r_name"
)
@@ -133,7 +131,7 @@ df_national_suppliers = df_national_suppliers.join(
# When we join to the customer dataframe, we don't want to confuse other
columns, so only
# select the supplier key that we need
-df_national_suppliers = df_national_suppliers.select_columns("s_suppkey")
+df_national_suppliers = df_national_suppliers.select("s_suppkey")
# Part 3: Combine suppliers and customers and compute the market share
diff --git a/examples/tpch/q09_product_type_profit_measure.py
b/examples/tpch/q09_product_type_profit_measure.py
index b4a7369..0295d30 100644
--- a/examples/tpch/q09_product_type_profit_measure.py
+++ b/examples/tpch/q09_product_type_profit_measure.py
@@ -39,16 +39,14 @@ part_color = lit("green")
ctx = SessionContext()
-df_part = ctx.read_parquet(get_data_path("part.parquet")).select_columns(
- "p_partkey", "p_name"
-)
-df_supplier =
ctx.read_parquet(get_data_path("supplier.parquet")).select_columns(
+df_part = ctx.read_parquet(get_data_path("part.parquet")).select("p_partkey",
"p_name")
+df_supplier = ctx.read_parquet(get_data_path("supplier.parquet")).select(
"s_suppkey", "s_nationkey"
)
-df_partsupp =
ctx.read_parquet(get_data_path("partsupp.parquet")).select_columns(
+df_partsupp = ctx.read_parquet(get_data_path("partsupp.parquet")).select(
"ps_suppkey", "ps_partkey", "ps_supplycost"
)
-df_lineitem =
ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns(
+df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select(
"l_partkey",
"l_extendedprice",
"l_discount",
@@ -56,10 +54,10 @@ df_lineitem =
ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns
"l_orderkey",
"l_quantity",
)
-df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select_columns(
+df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select(
"o_orderkey", "o_custkey", "o_orderdate"
)
-df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select_columns(
+df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select(
"n_nationkey", "n_name", "n_regionkey"
)
diff --git a/examples/tpch/q10_returned_item_reporting.py
b/examples/tpch/q10_returned_item_reporting.py
index 78327c3..25f81b2 100644
--- a/examples/tpch/q10_returned_item_reporting.py
+++ b/examples/tpch/q10_returned_item_reporting.py
@@ -44,7 +44,7 @@ interval_one_quarter = lit(pa.scalar((0, 92, 0),
type=pa.month_day_nano_interval
ctx = SessionContext()
-df_customer =
ctx.read_parquet(get_data_path("customer.parquet")).select_columns(
+df_customer = ctx.read_parquet(get_data_path("customer.parquet")).select(
"c_custkey",
"c_nationkey",
"c_name",
@@ -53,13 +53,13 @@ df_customer =
ctx.read_parquet(get_data_path("customer.parquet")).select_columns
"c_phone",
"c_comment",
)
-df_lineitem =
ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns(
+df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select(
"l_extendedprice", "l_discount", "l_orderkey", "l_returnflag"
)
-df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select_columns(
+df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select(
"o_orderkey", "o_custkey", "o_orderdate"
)
-df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select_columns(
+df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select(
"n_nationkey", "n_name", "n_regionkey"
)
@@ -87,7 +87,7 @@ df = df.join(df_customer, (["o_custkey"], ["c_custkey"]),
how="inner")
df = df.join(df_nation, (["c_nationkey"], ["n_nationkey"]), how="inner")
# These are the columns the problem statement requires
-df = df.select_columns(
+df = df.select(
"c_custkey",
"c_name",
"revenue",
diff --git a/examples/tpch/q11_important_stock_identification.py
b/examples/tpch/q11_important_stock_identification.py
index 391eb45..86ff229 100644
--- a/examples/tpch/q11_important_stock_identification.py
+++ b/examples/tpch/q11_important_stock_identification.py
@@ -37,13 +37,13 @@ FRACTION = 0.0001
ctx = SessionContext()
-df_supplier =
ctx.read_parquet(get_data_path("supplier.parquet")).select_columns(
+df_supplier = ctx.read_parquet(get_data_path("supplier.parquet")).select(
"s_suppkey", "s_nationkey"
)
-df_partsupp =
ctx.read_parquet(get_data_path("partsupp.parquet")).select_columns(
+df_partsupp = ctx.read_parquet(get_data_path("partsupp.parquet")).select(
"ps_supplycost", "ps_availqty", "ps_suppkey", "ps_partkey"
)
-df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select_columns(
+df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select(
"n_nationkey", "n_name"
)
@@ -75,7 +75,7 @@ df = df.with_column(
df = df.filter(col("value") / col("total_value") >= lit(FRACTION))
# We only need to report on these two columns
-df = df.select_columns("ps_partkey", "value")
+df = df.select("ps_partkey", "value")
# Sort in descending order of value
df = df.sort(col("value").sort(ascending=False))
diff --git a/examples/tpch/q12_ship_mode_order_priority.py
b/examples/tpch/q12_ship_mode_order_priority.py
index 150870c..c3fc0d2 100644
--- a/examples/tpch/q12_ship_mode_order_priority.py
+++ b/examples/tpch/q12_ship_mode_order_priority.py
@@ -42,10 +42,10 @@ DATE_OF_INTEREST = "1994-01-01"
ctx = SessionContext()
-df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select_columns(
+df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select(
"o_orderkey", "o_orderpriority"
)
-df_lineitem =
ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns(
+df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select(
"l_orderkey", "l_shipmode", "l_commitdate", "l_shipdate", "l_receiptdate"
)
diff --git a/examples/tpch/q13_customer_distribution.py
b/examples/tpch/q13_customer_distribution.py
index bc0a5bd..f8b6c13 100644
--- a/examples/tpch/q13_customer_distribution.py
+++ b/examples/tpch/q13_customer_distribution.py
@@ -38,12 +38,10 @@ WORD_2 = "requests"
ctx = SessionContext()
-df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select_columns(
+df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select(
"o_custkey", "o_comment"
)
-df_customer =
ctx.read_parquet(get_data_path("customer.parquet")).select_columns(
- "c_custkey"
-)
+df_customer = ctx.read_parquet(get_data_path("customer.parquet")).select("c_custkey")
# Use a regex to remove special cases
df_orders = df_orders.filter(
diff --git a/examples/tpch/q14_promotion_effect.py
b/examples/tpch/q14_promotion_effect.py
index 8cb1e4c..8224136 100644
--- a/examples/tpch/q14_promotion_effect.py
+++ b/examples/tpch/q14_promotion_effect.py
@@ -41,12 +41,10 @@ interval_one_month = lit(pa.scalar((0, 30, 0),
type=pa.month_day_nano_interval()
ctx = SessionContext()
-df_lineitem =
ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns(
+df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select(
"l_partkey", "l_shipdate", "l_extendedprice", "l_discount"
)
-df_part = ctx.read_parquet(get_data_path("part.parquet")).select_columns(
- "p_partkey", "p_type"
-)
+df_part = ctx.read_parquet(get_data_path("part.parquet")).select("p_partkey",
"p_type")
# Check part type begins with PROMO
diff --git a/examples/tpch/q15_top_supplier.py
b/examples/tpch/q15_top_supplier.py
index aa76093..44d5dd9 100644
--- a/examples/tpch/q15_top_supplier.py
+++ b/examples/tpch/q15_top_supplier.py
@@ -41,10 +41,10 @@ interval_3_months = lit(pa.scalar((0, 91, 0),
type=pa.month_day_nano_interval())
ctx = SessionContext()
-df_lineitem =
ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns(
+df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select(
"l_suppkey", "l_shipdate", "l_extendedprice", "l_discount"
)
-df_supplier =
ctx.read_parquet(get_data_path("supplier.parquet")).select_columns(
+df_supplier = ctx.read_parquet(get_data_path("supplier.parquet")).select(
"s_suppkey",
"s_name",
"s_address",
@@ -79,7 +79,7 @@ df = df.filter(col("total_revenue") == col("max_revenue"))
df = df.join(df_supplier, (["l_suppkey"], ["s_suppkey"]), "inner")
# Return only the columns requested
-df = df.select_columns("s_suppkey", "s_name", "s_address", "s_phone",
"total_revenue")
+df = df.select("s_suppkey", "s_name", "s_address", "s_phone", "total_revenue")
# If we have more than one, sort by supplier number (suppkey)
df = df.sort(col("s_suppkey").sort())
diff --git a/examples/tpch/q16_part_supplier_relationship.py
b/examples/tpch/q16_part_supplier_relationship.py
index fdcb5b4..cbdd998 100644
--- a/examples/tpch/q16_part_supplier_relationship.py
+++ b/examples/tpch/q16_part_supplier_relationship.py
@@ -40,13 +40,13 @@ SIZES_OF_INTEREST = [49, 14, 23, 45, 19, 3, 36, 9]
ctx = SessionContext()
-df_part = ctx.read_parquet(get_data_path("part.parquet")).select_columns(
+df_part = ctx.read_parquet(get_data_path("part.parquet")).select(
"p_partkey", "p_brand", "p_type", "p_size"
)
-df_partsupp =
ctx.read_parquet(get_data_path("partsupp.parquet")).select_columns(
+df_partsupp = ctx.read_parquet(get_data_path("partsupp.parquet")).select(
"ps_suppkey", "ps_partkey"
)
-df_supplier =
ctx.read_parquet(get_data_path("supplier.parquet")).select_columns(
+df_supplier = ctx.read_parquet(get_data_path("supplier.parquet")).select(
"s_suppkey", "s_comment"
)
@@ -75,7 +75,7 @@ df_part = df_part.filter(~F.array_position(p_sizes,
col("p_size")).is_null())
df = df_part.join(df_partsupp, (["p_partkey"], ["ps_partkey"]), "inner")
-df = df.select_columns("p_brand", "p_type", "p_size", "ps_suppkey").distinct()
+df = df.select("p_brand", "p_type", "p_size", "ps_suppkey").distinct()
df = df.aggregate(
[col("p_brand"), col("p_type"), col("p_size")],
diff --git a/examples/tpch/q17_small_quantity_order.py
b/examples/tpch/q17_small_quantity_order.py
index e0ee8bb..ff49427 100644
--- a/examples/tpch/q17_small_quantity_order.py
+++ b/examples/tpch/q17_small_quantity_order.py
@@ -38,10 +38,10 @@ CONTAINER = "MED BOX"
ctx = SessionContext()
-df_part = ctx.read_parquet(get_data_path("part.parquet")).select_columns(
+df_part = ctx.read_parquet(get_data_path("part.parquet")).select(
"p_partkey", "p_brand", "p_container"
)
-df_lineitem =
ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns(
+df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select(
"l_partkey", "l_quantity", "l_extendedprice"
)
diff --git a/examples/tpch/q18_large_volume_customer.py
b/examples/tpch/q18_large_volume_customer.py
index 10c5f6e..4976154 100644
--- a/examples/tpch/q18_large_volume_customer.py
+++ b/examples/tpch/q18_large_volume_customer.py
@@ -35,13 +35,13 @@ QUANTITY = 300
ctx = SessionContext()
-df_customer =
ctx.read_parquet(get_data_path("customer.parquet")).select_columns(
+df_customer = ctx.read_parquet(get_data_path("customer.parquet")).select(
"c_custkey", "c_name"
)
-df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select_columns(
+df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select(
"o_orderkey", "o_custkey", "o_orderdate", "o_totalprice"
)
-df_lineitem =
ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns(
+df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select(
"l_orderkey", "l_quantity", "l_extendedprice"
)
@@ -57,7 +57,7 @@ df = df.filter(col("total_quantity") > lit(QUANTITY))
df = df.join(df_orders, (["l_orderkey"], ["o_orderkey"]), "inner")
df = df.join(df_customer, (["o_custkey"], ["c_custkey"]), "inner")
-df = df.select_columns(
+df = df.select(
"c_name", "c_custkey", "o_orderkey", "o_orderdate", "o_totalprice",
"total_quantity"
)
diff --git a/examples/tpch/q19_discounted_revenue.py
b/examples/tpch/q19_discounted_revenue.py
index b15cd98..c2fe257 100644
--- a/examples/tpch/q19_discounted_revenue.py
+++ b/examples/tpch/q19_discounted_revenue.py
@@ -52,10 +52,10 @@ items_of_interest = {
ctx = SessionContext()
-df_part = ctx.read_parquet(get_data_path("part.parquet")).select_columns(
+df_part = ctx.read_parquet(get_data_path("part.parquet")).select(
"p_partkey", "p_brand", "p_container", "p_size"
)
-df_lineitem =
ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns(
+df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select(
"l_partkey",
"l_quantity",
"l_shipmode",
diff --git a/examples/tpch/q20_potential_part_promotion.py
b/examples/tpch/q20_potential_part_promotion.py
index 4ced7aa..3a0edb1 100644
--- a/examples/tpch/q20_potential_part_promotion.py
+++ b/examples/tpch/q20_potential_part_promotion.py
@@ -40,19 +40,17 @@ NATION_OF_INTEREST = "CANADA"
ctx = SessionContext()
-df_part = ctx.read_parquet(get_data_path("part.parquet")).select_columns(
- "p_partkey", "p_name"
-)
-df_lineitem =
ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns(
+df_part = ctx.read_parquet(get_data_path("part.parquet")).select("p_partkey",
"p_name")
+df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select(
"l_shipdate", "l_partkey", "l_suppkey", "l_quantity"
)
-df_partsupp =
ctx.read_parquet(get_data_path("partsupp.parquet")).select_columns(
+df_partsupp = ctx.read_parquet(get_data_path("partsupp.parquet")).select(
"ps_partkey", "ps_suppkey", "ps_availqty"
)
-df_supplier =
ctx.read_parquet(get_data_path("supplier.parquet")).select_columns(
+df_supplier = ctx.read_parquet(get_data_path("supplier.parquet")).select(
"s_suppkey", "s_address", "s_name", "s_nationkey"
)
-df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select_columns(
+df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select(
"n_nationkey", "n_name"
)
@@ -91,7 +89,7 @@ df = df.join(df_supplier, (["ps_suppkey"], ["s_suppkey"]),
"inner")
df = df.join(df_nation, (["s_nationkey"], ["n_nationkey"]), "inner")
# Restrict to the requested data per the problem statement
-df = df.select_columns("s_name", "s_address").distinct()
+df = df.select("s_name", "s_address").distinct()
df = df.sort(col("s_name").sort())
diff --git a/examples/tpch/q21_suppliers_kept_orders_waiting.py
b/examples/tpch/q21_suppliers_kept_orders_waiting.py
index 6b1679e..d3d57ac 100644
--- a/examples/tpch/q21_suppliers_kept_orders_waiting.py
+++ b/examples/tpch/q21_suppliers_kept_orders_waiting.py
@@ -35,16 +35,16 @@ NATION_OF_INTEREST = "SAUDI ARABIA"
ctx = SessionContext()
-df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select_columns(
+df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select(
"o_orderkey", "o_orderstatus"
)
-df_lineitem =
ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns(
+df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select(
"l_orderkey", "l_receiptdate", "l_commitdate", "l_suppkey"
)
-df_supplier =
ctx.read_parquet(get_data_path("supplier.parquet")).select_columns(
+df_supplier = ctx.read_parquet(get_data_path("supplier.parquet")).select(
"s_suppkey", "s_name", "s_nationkey"
)
-df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select_columns(
+df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select(
"n_nationkey", "n_name"
)
diff --git a/examples/tpch/q22_global_sales_opportunity.py
b/examples/tpch/q22_global_sales_opportunity.py
index 41fd5de..e6660e6 100644
--- a/examples/tpch/q22_global_sales_opportunity.py
+++ b/examples/tpch/q22_global_sales_opportunity.py
@@ -35,12 +35,10 @@ NATION_CODES = [13, 31, 23, 29, 30, 18, 17]
ctx = SessionContext()
-df_customer =
ctx.read_parquet(get_data_path("customer.parquet")).select_columns(
+df_customer = ctx.read_parquet(get_data_path("customer.parquet")).select(
"c_phone", "c_acctbal", "c_custkey"
)
-df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select_columns(
- "o_custkey"
-)
+df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select("o_custkey")
# The nation code is a two digit number, but we need to convert it to a string literal
nation_codes = F.make_array(*[lit(str(n)) for n in NATION_CODES])
diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py
index e4f8073..e59f00d 100644
--- a/python/datafusion/dataframe.py
+++ b/python/datafusion/dataframe.py
@@ -98,6 +98,9 @@ class DataFrame:
"""
return self.df.schema()
+ @deprecated(
+ "select_columns() is deprecated. Use :py:meth:`~DataFrame.select` instead"
+ )
def select_columns(self, *args: str) -> DataFrame:
"""Filter the DataFrame by columns.
diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py
index d73f5eb..7b20e9e 100644
--- a/python/tests/test_dataframe.py
+++ b/python/tests/test_dataframe.py
@@ -103,30 +103,28 @@ def partitioned_df():
def test_select(df):
- df = df.select(
+ df_1 = df.select(
column("a") + column("b"),
column("a") - column("b"),
)
# execute and collect the first (and only) batch
- result = df.collect()[0]
+ result = df_1.collect()[0]
assert result.column(0) == pa.array([5, 7, 9])
assert result.column(1) == pa.array([-3, -3, -3])
-
-def test_select_mixed_expr_string(df):
- df = df.select_columns(column("b"), "a")
+ df_2 = df.select("b", "a")
# execute and collect the first (and only) batch
- result = df.collect()[0]
+ result = df_2.collect()[0]
assert result.column(0) == pa.array([4, 5, 6])
assert result.column(1) == pa.array([1, 2, 3])
-def test_select_columns(df):
- df = df.select_columns("b", "a")
+def test_select_mixed_expr_string(df):
+ df = df.select(column("b"), "a")
# execute and collect the first (and only) batch
result = df.collect()[0]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]